diff options
Diffstat (limited to 'net')
81 files changed, 2849 insertions, 1378 deletions
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index f4078830ea50..0c423b8cd75c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -12,7 +12,7 @@ #include <linux/sched/signal.h> static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, - struct bpf_cgroup_storage *storage) + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { u32 ret; @@ -28,13 +28,20 @@ static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) { - struct bpf_cgroup_storage *storage = NULL; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; + enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; u32 ret = 0, i; - storage = bpf_cgroup_storage_alloc(prog); - if (IS_ERR(storage)) - return PTR_ERR(storage); + for_each_cgroup_storage_type(stype) { + storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(storage[stype])) { + storage[stype] = NULL; + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); + return -ENOMEM; + } + } if (!repeat) repeat = 1; @@ -53,7 +60,8 @@ static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) do_div(time_spent, repeat); *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; - bpf_cgroup_storage_free(storage); + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); return ret; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index a4a848bf827b..a7ea2d431714 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -162,6 +162,29 @@ out: return err; } +static int br_mdb_valid_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct br_port_msg *bpm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for mdb dump request"); + return -EINVAL; + } + + bpm = nlmsg_data(nlh); + if (bpm->ifindex) { + NL_SET_ERR_MSG_MOD(extack, "Filtering by device index is not supported for mdb dump request"); + return -EINVAL; + } + if (nlmsg_attrlen(nlh, sizeof(*bpm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request"); + return -EINVAL; + } + + return 0; +} + static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net_device *dev; @@ -169,6 +192,13 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) struct nlmsghdr *nlh = NULL; int idx = 0, s_idx; + if (cb->strict_check) { + int err = br_mdb_valid_dump_req(cb->nlh, cb->extack); + + if (err < 0) + return err; + } + s_idx = cb->args[0]; rcu_read_lock(); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e5a5bc5d5232..3345f1984542 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1034,6 +1034,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1114,6 +1115,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_VLAN_STATS_PER_PORT]) { + __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]); + + err = br_vlan_set_stats_per_port(br, per_port); + if (err) + return err; + } #endif if (data[IFLA_BR_GROUP_FWD_MASK]) { @@ -1327,6 +1336,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ @@ -1417,7 +1427,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, - br_opt_get(br, BROPT_VLAN_STATS_ENABLED))) + br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) || + nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT, + br_opt_get(br, IFLA_BR_VLAN_STATS_PER_PORT))) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 57229b9d800f..10ee39fdca5c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -320,6 +320,7 @@ enum net_bridge_opts { BROPT_HAS_IPV6_ADDR, BROPT_NEIGH_SUPPRESS_ENABLED, BROPT_MTU_SET_BY_USER, + BROPT_VLAN_STATS_PER_PORT, }; struct net_bridge { @@ -859,6 +860,7 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_set_stats(struct net_bridge *br, unsigned long val); +int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index c93c5724609e..60182bef6341 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -803,6 +803,22 @@ static ssize_t vlan_stats_enabled_store(struct device *d, return store_bridge_parm(d, buf, len, br_vlan_set_stats); } static DEVICE_ATTR_RW(vlan_stats_enabled); + +static ssize_t vlan_stats_per_port_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); +} + +static ssize_t vlan_stats_per_port_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_vlan_set_stats_per_port); +} +static DEVICE_ATTR_RW(vlan_stats_per_port); #endif static struct attribute *bridge_attrs[] = { @@ -856,6 +872,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, &dev_attr_vlan_stats_enabled.attr, + &dev_attr_vlan_stats_per_port.attr, #endif NULL }; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 5942e03dd845..9b707234e4ae 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -190,6 +190,19 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv) } } +static void nbp_vlan_rcu_free(struct rcu_head *rcu) +{ + struct net_bridge_vlan *v; + + v = container_of(rcu, struct net_bridge_vlan, rcu); + WARN_ON(br_vlan_is_master(v)); + /* if we had per-port stats configured then free them here */ + if (v->brvlan->stats != v->stats) + free_percpu(v->stats); + v->stats = NULL; + kfree(v); +} + /* This is the shared VLAN add function which works for both ports and bridge * devices. There are four possible calls to this function in terms of the * vlan entry type: @@ -245,7 +258,15 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) if (!masterv) goto out_filt; v->brvlan = masterv; - v->stats = masterv->stats; + if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { + v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); + if (!v->stats) { + err = -ENOMEM; + goto out_filt; + } + } else { + v->stats = masterv->stats; + } } else { err = br_switchdev_port_vlan_add(dev, v->vid, flags); if (err && err != -EOPNOTSUPP) @@ -329,7 +350,7 @@ static int __vlan_del(struct net_bridge_vlan *v) rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); - kfree_rcu(v, rcu); + call_rcu(&v->rcu, nbp_vlan_rcu_free); } br_vlan_put_master(masterv); @@ -830,6 +851,30 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val) return 0; } +int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) +{ + struct net_bridge_port *p; + + /* allow to change the option if there are no port vlans configured */ + list_for_each_entry(p, &br->port_list, list) { + struct net_bridge_vlan_group *vg = nbp_vlan_group(p); + + if (vg->num_vlans) + return -EBUSY; + } + + switch (val) { + case 0: + case 1: + br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); + break; + default: + return -EINVAL; + } + + return 0; +} + static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) { struct net_bridge_vlan *v; diff --git a/net/core/devlink.c b/net/core/devlink.c index 938f68ee92f0..6dae81d65d5c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3504,7 +3504,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, start_offset = *((u64 *)&cb->args[0]); err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize, - attrs, DEVLINK_ATTR_MAX, ops->policy, NULL); + attrs, DEVLINK_ATTR_MAX, ops->policy, cb->extack); if (err) goto out; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3144ef2bf136..4cc603dfc9ef 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -27,6 +27,7 @@ #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/net.h> +#include <net/xdp_sock.h> /* * Some useful ethtool_ops methods that're device independent. @@ -1662,8 +1663,10 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev, static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { - struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS }; + struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; + u16 from_channel, to_channel; u32 max_rx_in_use = 0; + unsigned int i; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; @@ -1671,13 +1674,13 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; - dev->ethtool_ops->get_channels(dev, &max); + dev->ethtool_ops->get_channels(dev, &curr); /* ensure new counts are within the maximums */ - if ((channels.rx_count > max.max_rx) || - (channels.tx_count > max.max_tx) || - (channels.combined_count > max.max_combined) || - (channels.other_count > max.max_other)) + if (channels.rx_count > curr.max_rx || + channels.tx_count > curr.max_tx || + channels.combined_count > curr.max_combined || + channels.other_count > curr.max_other) return -EINVAL; /* ensure the new Rx count fits within the configured Rx flow @@ -1687,6 +1690,14 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, (channels.combined_count + channels.rx_count) <= max_rx_in_use) return -EINVAL; + /* Disabling channels, query zero-copy AF_XDP sockets */ + from_channel = channels.combined_count + + min(channels.rx_count, channels.tx_count); + to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); + for (i = from_channel; i < to_channel; i++) + if (xdp_get_umem_from_qid(dev, i)) + return -EINVAL; + return dev->ethtool_ops->set_channels(dev, &channels); } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 0ff3953f64aa..ffbb827723a2 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1063,13 +1063,47 @@ skip: return err; } +static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct fib_rule_hdr *frh; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); + return -EINVAL; + } + + frh = nlmsg_data(nlh); + if (frh->dst_len || frh->src_len || frh->tos || frh->table || + frh->res1 || frh->res2 || frh->action || frh->flags) { + NL_SET_ERR_MSG(extack, + "Invalid values in header for fib rule dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request"); + return -EINVAL; + } + + return 0; +} + static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct fib_rules_ops *ops; int idx = 0, family; - family = rtnl_msg_family(cb->nlh); + if (cb->strict_check) { + int err = fib_valid_dumprule_req(nlh, cb->extack); + + if (err < 0) + return err; + } + + family = rtnl_msg_family(nlh); if (family != AF_UNSPEC) { /* Protocol specific dump request */ ops = lookup_rules_ops(net, family); diff --git a/net/core/filter.c b/net/core/filter.c index 72db8afb7cb6..4bbc6567fcb8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -58,13 +58,17 @@ #include <net/busy_poll.h> #include <net/tcp.h> #include <net/xfrm.h> +#include <net/udp.h> #include <linux/bpf_trace.h> #include <net/xdp_sock.h> #include <linux/inetdevice.h> +#include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> #include <net/ip_fib.h> #include <net/flow.h> #include <net/arp.h> #include <net/ipv6.h> +#include <net/net_namespace.h> #include <linux/seg6_local.h> #include <net/seg6.h> #include <net/seg6_local.h> @@ -4813,6 +4817,143 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ +#ifdef CONFIG_INET +static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, + struct sk_buff *skb, u8 family, u8 proto) +{ + int dif = skb->dev->ifindex; + bool refcounted = false; + struct sock *sk = NULL; + + if (family == AF_INET) { + __be32 src4 = tuple->ipv4.saddr; + __be32 dst4 = tuple->ipv4.daddr; + int sdif = inet_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &refcounted); + else + sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &udp_table, skb); +#if IS_REACHABLE(CONFIG_IPV6) + } else { + struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; + struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; + int sdif = inet6_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, &refcounted); + else + sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, &udp_table, skb); +#endif + } + + if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + sk = NULL; + } + return sk; +} + +/* bpf_sk_lookup performs the core lookup for different types of sockets, + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. + * Returns the socket as an 'unsigned long' to simplify the casting in the + * callers to satisfy BPF_CALL declarations. + */ +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + struct sock *sk = NULL; + u8 family = AF_UNSPEC; + struct net *net; + + family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; + if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + goto out; + + if (skb->dev) + caller_net = dev_net(skb->dev); + else + caller_net = sock_net(skb->sk); + if (netns_id) { + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + sk = sk_lookup(net, tuple, skb, family, proto); + put_net(net); + } else { + net = caller_net; + sk = sk_lookup(net, tuple, skb, family, proto); + } + + if (sk) + sk = sk_to_full_sk(sk); +out: + return (unsigned long) sk; +} + +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { + .func = bpf_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { + .func = bpf_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_release, struct sock *, sk) +{ + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return 0; +} + +static const struct bpf_func_proto bpf_sk_release_proto = { + .func = bpf_sk_release, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; +#endif /* CONFIG_INET */ + bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || @@ -5019,6 +5160,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5119,6 +5268,14 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_hash_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; +#ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -5394,23 +5551,29 @@ static bool __sock_filter_check_size(int off, int size, return size == size_default; } -static bool sock_filter_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) +bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; - if (!__sock_filter_check_attach_type(off, type, - prog->expected_attach_type)) - return false; if (!__sock_filter_check_size(off, size, info)) return false; return true; } +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (!bpf_sock_is_valid_access(off, size, type, info)) + return false; + return __sock_filter_check_attach_type(off, type, + prog->expected_attach_type); +} + static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { @@ -6122,10 +6285,10 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, - const struct bpf_insn *si, - struct bpf_insn *insn_buf, - struct bpf_prog *prog, u32 *target_size) +u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; @@ -7037,7 +7200,7 @@ const struct bpf_prog_ops lwt_seg6local_prog_ops = { const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, - .convert_ctx_access = sock_filter_convert_ctx_access, + .convert_ctx_access = bpf_sock_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_prog_ops = { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index fb023df48b83..69c41cb3966d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -232,7 +232,8 @@ static void pneigh_queue_purge(struct sk_buff_head *list) } } -static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) +static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { int i; struct neigh_hash_table *nht; @@ -250,6 +251,10 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) np = &n->next; continue; } + if (skip_perm && n->nud_state & NUD_PERMANENT) { + np = &n->next; + continue; + } rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); @@ -285,21 +290,35 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev); + neigh_flush_dev(tbl, dev, false); write_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); -int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { write_lock_bh(&tbl->lock); - neigh_flush_dev(tbl, dev); + neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); del_timer_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue); return 0; } + +int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev) +{ + __neigh_ifdown(tbl, dev, true); + return 0; +} +EXPORT_SYMBOL(neigh_carrier_down); + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + __neigh_ifdown(tbl, dev, false); + return 0; +} EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) @@ -2164,15 +2183,47 @@ errout: return err; } +static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ndtmsg *ndtm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); + return -EINVAL; + } + + ndtm = nlmsg_data(nlh); + if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ndtm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request"); + return -EINVAL; + } + + return 0; +} + static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int family, tidx, nidx = 0; int tbl_skip = cb->args[0]; int neigh_skip = cb->args[1]; struct neigh_table *tbl; - family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; + if (cb->strict_check) { + int err = neightbl_valid_dump_info(nlh, cb->extack); + + if (err < 0) + return err; + } + + family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; @@ -2185,7 +2236,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) continue; if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL, + nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) break; @@ -2200,7 +2251,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) if (neightbl_fill_param_info(skb, tbl, p, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNEIGHTBL, NLM_F_MULTI) < 0) goto out; @@ -2426,11 +2477,73 @@ out: } +static int neigh_valid_dump_req(const struct nlmsghdr *nlh, + bool strict_check, + struct neigh_dump_filter *filter, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + int err, i; + + if (strict_check) { + struct ndmsg *ndm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || + ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + NULL, extack); + } else { + err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + NULL, extack); + } + if (err < 0) + return err; + + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + /* all new attributes should require strict_check */ + switch (i) { + case NDA_IFINDEX: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in neighbor dump request"); + return -EINVAL; + } + filter->dev_idx = nla_get_u32(tb[i]); + break; + case NDA_MASTER: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in neighbor dump request"); + return -EINVAL; + } + filter->master_idx = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request"); + return -EINVAL; + } + } + } + + return 0; +} + static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct neigh_dump_filter filter = {}; - struct nlattr *tb[NDA_MAX + 1]; struct neigh_table *tbl; int t, family, s_t; int proxy = 0; @@ -2445,19 +2558,10 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY) proxy = 1; - err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL); - if (!err) { - if (tb[NDA_IFINDEX]) { - if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) - return -EINVAL; - filter.dev_idx = nla_get_u32(tb[NDA_IFINDEX]); - } - if (tb[NDA_MASTER]) { - if (nla_len(tb[NDA_MASTER]) != sizeof(u32)) - return -EINVAL; - filter.master_idx = nla_get_u32(tb[NDA_MASTER]); - } - } + err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); + if (err < 0 && cb->strict_check) + return err; + s_t = cb->args[0]; for (t = 0; t < NEIGH_NR_TABLES; t++) { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 670c84b1bfc2..fefe72774aeb 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -853,6 +853,12 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) .s_idx = cb->args[0], }; + if (cb->strict_check && + nlmsg_attrlen(cb->nlh, sizeof(struct rtgenmsg))) { + NL_SET_ERR_MSG(cb->extack, "Unknown data in network namespace id dump request"); + return -EINVAL; + } + spin_lock_bh(&net->nsid_lock); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); spin_unlock_bh(&net->nsid_lock); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5564eee1e980..0958c7be2c22 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include <net/rtnetlink.h> #include <net/net_namespace.h> -#define RTNL_MAX_TYPE 48 +#define RTNL_MAX_TYPE 49 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { @@ -1878,8 +1878,52 @@ struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) } EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); +static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, + bool strict_check, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + int hdrlen; + + if (strict_check) { + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change) { + NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); + return -EINVAL; + } + if (ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); + return -EINVAL; + } + + return nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, + ifla_policy, extack); + } + + /* A hack to preserve kernel<->userspace interface. + * The correct header is ifinfomsg. It is consistent with rtnl_getlink. + * However, before Linux v3.9 the code here assumed rtgenmsg and that's + * what iproute2 < v3.9.0 used. + * We can detect the old iproute2. Even including the IFLA_EXT_MASK + * attribute, its netlink message is shorter than struct ifinfomsg. + */ + hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? + sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + + return nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); +} + static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { + struct netlink_ext_ack *extack = cb->extack; + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; int h, s_h; @@ -1892,44 +1936,54 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) unsigned int flags = NLM_F_MULTI; int master_idx = 0; int netnsid = -1; - int err; - int hdrlen; + int err, i; s_h = cb->args[0]; s_idx = cb->args[1]; - /* A hack to preserve kernel<->userspace interface. - * The correct header is ifinfomsg. It is consistent with rtnl_getlink. - * However, before Linux v3.9 the code here assumed rtgenmsg and that's - * what iproute2 < v3.9.0 used. - * We can detect the old iproute2. Even including the IFLA_EXT_MASK - * attribute, its netlink message is shorter than struct ifinfomsg. - */ - hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ? - sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); + err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); + if (err < 0) { + if (cb->strict_check) + return err; + + goto walk_entries; + } + + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; - if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, - ifla_policy, NULL) >= 0) { - if (tb[IFLA_TARGET_NETNSID]) { - netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); + /* new attributes should only be added with strict checking */ + switch (i) { + case IFLA_TARGET_NETNSID: + netnsid = nla_get_s32(tb[i]); tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); - if (IS_ERR(tgt_net)) + if (IS_ERR(tgt_net)) { + NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); return PTR_ERR(tgt_net); + } + break; + case IFLA_EXT_MASK: + ext_filter_mask = nla_get_u32(tb[i]); + break; + case IFLA_MASTER: + master_idx = nla_get_u32(tb[i]); + break; + case IFLA_LINKINFO: + kind_ops = linkinfo_to_kind_ops(tb[i]); + break; + default: + if (cb->strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); + return -EINVAL; + } } - - if (tb[IFLA_EXT_MASK]) - ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); - - if (tb[IFLA_MASTER]) - master_idx = nla_get_u32(tb[IFLA_MASTER]); - - if (tb[IFLA_LINKINFO]) - kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]); - - if (master_idx || kind_ops) - flags |= NLM_F_DUMP_FILTERED; } + if (master_idx || kind_ops) + flags |= NLM_F_DUMP_FILTERED; + +walk_entries: for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &tgt_net->dev_index_head[h]; @@ -1941,8 +1995,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, - flags, + nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid); @@ -3746,22 +3799,66 @@ out: } EXPORT_SYMBOL(ndo_dflt_fdb_dump); -static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[NDA_MAX + 1]; + struct ndmsg *ndm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); + return -EINVAL; + } + + ndm = nlmsg_data(nlh); + if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || + ndm->ndm_flags || ndm->ndm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for fbd dump request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, + NULL, extack); + if (err < 0) + return err; + + *brport_idx = ndm->ndm_ifindex; + for (i = 0; i <= NDA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case NDA_IFINDEX: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); + return -EINVAL; + } + *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); + break; + case NDA_MASTER: + if (nla_len(tb[i]) != sizeof(u32)) { + NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); + return -EINVAL; + } + *br_idx = nla_get_u32(tb[NDA_MASTER]); + break; + default: + NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); + return -EINVAL; + } + } + + return 0; +} + +static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, + int *br_idx, int *brport_idx, + struct netlink_ext_ack *extack) { - struct net_device *dev; struct nlattr *tb[IFLA_MAX+1]; - struct net_device *br_dev = NULL; - const struct net_device_ops *ops = NULL; - const struct net_device_ops *cops = NULL; - struct ifinfomsg *ifm = nlmsg_data(cb->nlh); - struct net *net = sock_net(skb->sk); - struct hlist_head *head; - int brport_idx = 0; - int br_idx = 0; - int h, s_h; - int idx = 0, s_idx; - int err = 0; - int fidx = 0; + int err; /* A hack to preserve kernel<->userspace interface. * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. @@ -3770,20 +3867,49 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) * Fortunately these sizes don't conflict with the size of ifinfomsg * with an optional attribute. */ - if (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) && - (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) + + if (nlmsg_len(nlh) != sizeof(struct ndmsg) && + (nlmsg_len(nlh) != sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))) { - err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, - IFLA_MAX, ifla_policy, NULL); + struct ifinfomsg *ifm; + + err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, + ifla_policy, extack); if (err < 0) { return -EINVAL; } else if (err == 0) { if (tb[IFLA_MASTER]) - br_idx = nla_get_u32(tb[IFLA_MASTER]); + *br_idx = nla_get_u32(tb[IFLA_MASTER]); } - brport_idx = ifm->ifi_index; + ifm = nlmsg_data(nlh); + *brport_idx = ifm->ifi_index; } + return 0; +} + +static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net_device *dev; + struct net_device *br_dev = NULL; + const struct net_device_ops *ops = NULL; + const struct net_device_ops *cops = NULL; + struct net *net = sock_net(skb->sk); + struct hlist_head *head; + int brport_idx = 0; + int br_idx = 0; + int h, s_h; + int idx = 0, s_idx; + int err = 0; + int fidx = 0; + + if (cb->strict_check) + err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, + cb->extack); + else + err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, + cb->extack); + if (err < 0) + return err; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); @@ -3968,28 +4094,72 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); +static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, + bool strict_check, u32 *filter_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX+1]; + int err, i; + + if (strict_check) { + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + } else { + err = nlmsg_parse(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + } + if (err < 0) + return err; + + /* new attributes should only be added with strict checking */ + for (i = 0; i <= IFLA_MAX; ++i) { + if (!tb[i]) + continue; + + switch (i) { + case IFLA_EXT_MASK: + *filter_mask = nla_get_u32(tb[i]); + break; + default: + if (strict_check) { + NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); + return -EINVAL; + } + } + } + + return 0; +} + static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; - u32 seq = cb->nlh->nlmsg_seq; + u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; int err; - if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { - struct nlattr *extfilt; - - extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), - IFLA_EXT_MASK); - if (extfilt) { - if (nla_len(extfilt) < sizeof(filter_mask)) - return -EINVAL; - - filter_mask = nla_get_u32(extfilt); - } - } + err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, + cb->extack); + if (err < 0 && cb->strict_check) + return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { @@ -4583,6 +4753,7 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct netlink_ext_ack *extack = cb->extack; int h, s_h, err, s_idx, s_idxattr, s_prividx; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; @@ -4599,13 +4770,32 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = net->dev_base_seq; - if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) + if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) { + NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; + } ifsm = nlmsg_data(cb->nlh); + + /* only requests using strict checks can pass data to influence + * the dump. The legacy exception is filter_mask. + */ + if (cb->strict_check) { + if (ifsm->pad1 || ifsm->pad2 || ifsm->ifindex) { + NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); + return -EINVAL; + } + if (nlmsg_attrlen(cb->nlh, sizeof(*ifsm))) { + NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); + return -EINVAL; + } + } + filter_mask = ifsm->filter_mask; - if (!filter_mask) + if (!filter_mask) { + NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; + } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index e90c89ef8c08..850a6f13a082 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1255,6 +1255,8 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); + if (!netif_carrier_ok(dev)) + neigh_carrier_down(&arp_tbl, dev); break; default: break; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 44d931a3cd50..d122ebbe5980 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -782,7 +782,8 @@ static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, } static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, - __u32 *pvalid_lft, __u32 *pprefered_lft) + __u32 *pvalid_lft, __u32 *pprefered_lft, + struct netlink_ext_ack *extack) { struct nlattr *tb[IFA_MAX+1]; struct in_ifaddr *ifa; @@ -792,7 +793,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, int err; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, - NULL); + extack); if (err < 0) goto errout; @@ -897,7 +898,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ASSERT_RTNL(); - ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft); + ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft, extack); if (IS_ERR(ifa)) return PTR_ERR(ifa); @@ -1659,17 +1660,70 @@ nla_put_failure: return -EMSGSIZE; } +static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, + struct inet_fill_args *fillargs, + struct net **tgt_net, struct sock *sk, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFA_MAX+1]; + struct ifaddrmsg *ifm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); + return -EINVAL; + } + if (ifm->ifa_index) { + NL_SET_ERR_MSG(extack, "ipv4: Filter by device index not supported for address dump"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv4_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= IFA_MAX; ++i) { + if (!tb[i]) + continue; + + if (i == IFA_TARGET_NETNSID) { + struct net *net; + + fillargs->netnsid = nla_get_s32(tb[i]); + + net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); + if (IS_ERR(net)) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); + return PTR_ERR(net); + } + *tgt_net = net; + } else { + NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, - .seq = cb->nlh->nlmsg_seq, + .seq = nlh->nlmsg_seq, .event = RTM_NEWADDR, .flags = NLM_F_MULTI, .netnsid = -1, }; struct net *net = sock_net(skb->sk); - struct nlattr *tb[IFA_MAX+1]; struct net *tgt_net = net; int h, s_h; int idx, s_idx; @@ -1683,16 +1737,13 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) s_idx = idx = cb->args[1]; s_ip_idx = ip_idx = cb->args[2]; - if (nlmsg_parse(cb->nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX, - ifa_ipv4_policy, NULL) >= 0) { - if (tb[IFA_TARGET_NETNSID]) { - fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]); + if (cb->strict_check) { + int err; - tgt_net = rtnl_get_net_ns_capable(skb->sk, - fillargs.netnsid); - if (IS_ERR(tgt_net)) - return PTR_ERR(tgt_net); - } + err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, + skb->sk, cb->extack); + if (err < 0) + return err; } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { @@ -2035,6 +2086,7 @@ errout: static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int h, s_h; int idx, s_idx; @@ -2042,6 +2094,21 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, struct in_device *in_dev; struct hlist_head *head; + if (cb->strict_check) { + struct netlink_ext_ack *extack = cb->extack; + struct netconfmsg *ncm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ncm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); + return -EINVAL; + } + } + s_h = cb->args[0]; s_idx = idx = cb->args[1]; @@ -2061,7 +2128,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, if (inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) { @@ -2078,7 +2145,7 @@ cont: if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; @@ -2089,7 +2156,7 @@ cont: if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 30e2bcc3ef2a..038f511c73fa 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -802,8 +802,40 @@ errout: return err; } +int ip_valid_fib_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); + return -EINVAL; + } + + rtm = nlmsg_data(nlh); + if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || + rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || + rtm->rtm_type) { + NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); + return -EINVAL; + } + if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { + NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*rtm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in FIB dump request"); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); + static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; @@ -811,8 +843,14 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_head *head; int dumped = 0, err; - if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && - ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) + if (cb->strict_check) { + err = ip_valid_fib_dump_req(nlh, cb->extack); + if (err < 0) + return err; + } + + if (nlmsg_len(nlh) >= sizeof(struct rtmsg) && + ((struct rtmsg *)nlmsg_data(nlh))->rtm_flags & RTM_F_CLONED) return skb->len; s_h = cb->args[0]; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5660adcf7a04..91b0d5671649 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2527,6 +2527,13 @@ errout_free: static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { + if (cb->strict_check) { + int err = ip_valid_fib_dump_req(cb->nlh, cb->extack); + + if (err < 0) + return err; + } + return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, _ipmr_fill_mroute, &mfc_unres_lock); } @@ -2710,6 +2717,31 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) return true; } +static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ifm))) { + NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request"); + return -EINVAL; + } + + return 0; +} + static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -2718,6 +2750,13 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) unsigned int e = 0, s_e; struct mr_table *mrt; + if (cb->strict_check) { + int err = ipmr_valid_dumplink(cb->nlh, cb->extack); + + if (err < 0) + return err; + } + s_t = cb->args[0]; s_e = cb->args[1]; diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 6115bf1ff6f0..78a67f961d86 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -264,7 +264,6 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, return nf_nat_inet_fn(priv, skb, state); } -EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn); static unsigned int nf_nat_ipv4_in(void *priv, struct sk_buff *skb, diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index ad3aeff152ed..a9d5e013e555 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -104,12 +104,26 @@ static int masq_device_event(struct notifier_block *this, return NOTIFY_DONE; } +static int inet_cmp(struct nf_conn *ct, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + struct nf_conntrack_tuple *tuple; + + if (!device_cmp(ct, (void *)(long)dev->ifindex)) + return 0; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + return ifa->ifa_address == tuple->dst.u3.ip; +} + static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct netdev_notifier_info info; + struct net *net = dev_net(idev->dev); /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have @@ -119,8 +133,10 @@ static int masq_inet_event(struct notifier_block *this, if (idev->dead) return NOTIFY_DONE; - netdev_notifier_info_init(&info, idev->dev); - return masq_device_event(this, event, &info); + if (event == NETDEV_DOWN) + nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); + + return NOTIFY_DONE; } static struct notifier_block masq_dev_notifier = { diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index ca61e2a659e7..cd4814f7e962 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -44,6 +44,7 @@ #include <linux/mm.h> #include <net/tcp.h> #include <linux/inet_diag.h> +#include "tcp_dctcp.h" #define DCTCP_MAX_ALPHA 1024U @@ -118,54 +119,6 @@ static u32 dctcp_ssthresh(struct sock *sk) return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } -/* Minimal DCTP CE state machine: - * - * S: 0 <- last pkt was non-CE - * 1 <- last pkt was CE - */ - -static void dctcp_ce_state_0_to_1(struct sock *sk) -{ - struct dctcp *ca = inet_csk_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (!ca->ce_state) { - /* State has changed from CE=0 to CE=1, force an immediate - * ACK to reflect the new CE state. If an ACK was delayed, - * send that first to reflect the prior CE state. - */ - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) - __tcp_send_ack(sk, ca->prior_rcv_nxt); - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } - - ca->prior_rcv_nxt = tp->rcv_nxt; - ca->ce_state = 1; - - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; -} - -static void dctcp_ce_state_1_to_0(struct sock *sk) -{ - struct dctcp *ca = inet_csk_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (ca->ce_state) { - /* State has changed from CE=1 to CE=0, force an immediate - * ACK to reflect the new CE state. If an ACK was delayed, - * send that first to reflect the prior CE state. - */ - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) - __tcp_send_ack(sk, ca->prior_rcv_nxt); - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } - - ca->prior_rcv_nxt = tp->rcv_nxt; - ca->ce_state = 0; - - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; -} - static void dctcp_update_alpha(struct sock *sk, u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); @@ -230,12 +183,12 @@ static void dctcp_state(struct sock *sk, u8 new_state) static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { + struct dctcp *ca = inet_csk_ca(sk); + switch (ev) { case CA_EVENT_ECN_IS_CE: - dctcp_ce_state_0_to_1(sk); - break; case CA_EVENT_ECN_NO_CE: - dctcp_ce_state_1_to_0(sk); + dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; default: /* Don't care for the rest. */ diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h new file mode 100644 index 000000000000..d69a77cbd0c7 --- /dev/null +++ b/net/ipv4/tcp_dctcp.h @@ -0,0 +1,40 @@ +#ifndef _TCP_DCTCP_H +#define _TCP_DCTCP_H + +static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (ce_state == 1) + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + else + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +/* Minimal DCTP CE state machine: + * + * S: 0 <- last pkt was non-CE + * 1 <- last pkt was CE + */ +static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, + u32 *prior_rcv_nxt, u32 *ce_state) +{ + u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0; + + if (*ce_state != new_ce_state) { + /* CE state has changed, force an immediate ACK to + * reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { + dctcp_ece_ack_cwr(sk, *ce_state); + __tcp_send_ack(sk, *prior_rcv_nxt); + } + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } + *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt; + *ce_state = new_ce_state; + dctcp_ece_ack_cwr(sk, new_ce_state); +} + +#endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a9a317322388..2496b12bf721 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -666,6 +666,7 @@ errout: static int inet6_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); int h, s_h; int idx, s_idx; @@ -673,6 +674,21 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, struct inet6_dev *idev; struct hlist_head *head; + if (cb->strict_check) { + struct netlink_ext_ack *extack = cb->extack; + struct netconfmsg *ncm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ncm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request"); + return -EINVAL; + } + } + s_h = cb->args[0]; s_idx = idx = cb->args[1]; @@ -692,7 +708,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb, if (inet6_netconf_fill_devconf(skb, dev->ifindex, &idev->cnf, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) { @@ -709,7 +725,7 @@ cont: if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; @@ -720,7 +736,7 @@ cont: if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) goto done; @@ -4793,12 +4809,19 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(4) /* IFA_RT_PRIORITY */; } +enum addr_type_t { + UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + struct inet6_fill_args { u32 portid; u32 seq; int event; unsigned int flags; int netnsid; + enum addr_type_t type; }; static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, @@ -4930,39 +4953,28 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, return 0; } -enum addr_type_t { - UNICAST_ADDR, - MULTICAST_ADDR, - ANYCAST_ADDR, -}; - /* called with rcu_read_lock() */ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, - struct netlink_callback *cb, enum addr_type_t type, - int s_ip_idx, int *p_ip_idx, int netnsid) + struct netlink_callback *cb, + int s_ip_idx, int *p_ip_idx, + struct inet6_fill_args *fillargs) { - struct inet6_fill_args fillargs = { - .portid = NETLINK_CB(cb->skb).portid, - .seq = cb->nlh->nlmsg_seq, - .flags = NLM_F_MULTI, - .netnsid = netnsid, - }; struct ifmcaddr6 *ifmca; struct ifacaddr6 *ifaca; int err = 1; int ip_idx = *p_ip_idx; read_lock_bh(&idev->lock); - switch (type) { + switch (fillargs->type) { case UNICAST_ADDR: { struct inet6_ifaddr *ifa; - fillargs.event = RTM_NEWADDR; + fillargs->event = RTM_NEWADDR; /* unicast address incl. temp addr */ list_for_each_entry(ifa, &idev->addr_list, if_list) { if (++ip_idx < s_ip_idx) continue; - err = inet6_fill_ifaddr(skb, ifa, &fillargs); + err = inet6_fill_ifaddr(skb, ifa, fillargs); if (err < 0) break; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -4970,26 +4982,26 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, break; } case MULTICAST_ADDR: - fillargs.event = RTM_GETMULTICAST; + fillargs->event = RTM_GETMULTICAST; /* multicast address */ for (ifmca = idev->mc_list; ifmca; ifmca = ifmca->next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); + err = inet6_fill_ifmcaddr(skb, ifmca, fillargs); if (err < 0) break; } break; case ANYCAST_ADDR: - fillargs.event = RTM_GETANYCAST; + fillargs->event = RTM_GETANYCAST; /* anycast address */ for (ifaca = idev->ac_list; ifaca; ifaca = ifaca->aca_next, ip_idx++) { if (ip_idx < s_ip_idx) continue; - err = inet6_fill_ifacaddr(skb, ifaca, &fillargs); + err = inet6_fill_ifacaddr(skb, ifaca, fillargs); if (err < 0) break; } @@ -5002,13 +5014,71 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb, return err; } +static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, + struct inet6_fill_args *fillargs, + struct net **tgt_net, struct sock *sk, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFA_MAX+1]; + struct ifaddrmsg *ifm; + int err, i; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request"); + return -EINVAL; + } + if (ifm->ifa_index) { + NL_SET_ERR_MSG_MOD(extack, "Filter by device index not supported for address dump"); + return -EINVAL; + } + + err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFA_MAX, + ifa_ipv6_policy, extack); + if (err < 0) + return err; + + for (i = 0; i <= IFA_MAX; ++i) { + if (!tb[i]) + continue; + + if (i == IFA_TARGET_NETNSID) { + struct net *net; + + fillargs->netnsid = nla_get_s32(tb[i]); + net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); + if (IS_ERR(net)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id"); + return PTR_ERR(net); + } + *tgt_net = net; + } else { + NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request"); + return -EINVAL; + } + } + + return 0; +} + static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, enum addr_type_t type) { + const struct nlmsghdr *nlh = cb->nlh; + struct inet6_fill_args fillargs = { + .portid = NETLINK_CB(cb->skb).portid, + .seq = cb->nlh->nlmsg_seq, + .flags = NLM_F_MULTI, + .netnsid = -1, + .type = type, + }; struct net *net = sock_net(skb->sk); - struct nlattr *tb[IFA_MAX+1]; struct net *tgt_net = net; - int netnsid = -1; int h, s_h; int idx, ip_idx; int s_idx, s_ip_idx; @@ -5020,15 +5090,13 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, s_idx = idx = cb->args[1]; s_ip_idx = ip_idx = cb->args[2]; - if (nlmsg_parse(cb->nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX, - ifa_ipv6_policy, NULL) >= 0) { - if (tb[IFA_TARGET_NETNSID]) { - netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]); + if (cb->strict_check) { + int err; - tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); - if (IS_ERR(tgt_net)) - return PTR_ERR(tgt_net); - } + err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, + skb->sk, cb->extack); + if (err < 0) + return err; } rcu_read_lock(); @@ -5046,8 +5114,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, if (!idev) goto cont; - if (in6_dump_addrs(idev, skb, cb, type, - s_ip_idx, &ip_idx, netnsid) < 0) + if (in6_dump_addrs(idev, skb, cb, s_ip_idx, &ip_idx, + &fillargs) < 0) goto done; cont: idx++; @@ -5058,7 +5126,7 @@ done: cb->args[0] = h; cb->args[1] = idx; cb->args[2] = ip_idx; - if (netnsid >= 0) + if (fillargs.netnsid >= 0) put_net(tgt_net); return skb->len; @@ -5592,6 +5660,31 @@ nla_put_failure: return -EMSGSIZE; } +static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ifinfomsg *ifm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ifm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header"); + return -EINVAL; + } + + ifm = nlmsg_data(nlh); + if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || + ifm->ifi_change || ifm->ifi_index) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request"); + return -EINVAL; + } + + return 0; +} + static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -5601,6 +5694,16 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct inet6_dev *idev; struct hlist_head *head; + /* only requests using strict checking can pass data to + * influence the dump + */ + if (cb->strict_check) { + int err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack); + + if (err < 0) + return err; + } + s_h = cb->args[0]; s_idx = cb->args[1]; diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 1d6ced37ad71..0d1ee82ee55b 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -458,20 +458,52 @@ static int ip6addrlbl_fill(struct sk_buff *skb, return 0; } +static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct ifaddrlblmsg *ifal; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request"); + return -EINVAL; + } + + ifal = nlmsg_data(nlh); + if (ifal->__ifal_reserved || ifal->ifal_prefixlen || + ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ifal))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump requewst"); + return -EINVAL; + } + + return 0; +} + static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct ip6addrlbl_entry *p; int idx = 0, s_idx = cb->args[0]; int err; + if (cb->strict_check) { + err = ip6addrlbl_valid_dump_req(nlh, cb->extack); + if (err < 0) + return err; + } + rcu_read_lock(); hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { if (idx >= s_idx) { err = ip6addrlbl_fill(skb, p, net->ipv6.ip6addrlbl_table.seq, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWADDRLABEL, NLM_F_MULTI); if (err < 0) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cf709eadc932..9ba72d94d60f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -47,6 +47,7 @@ struct fib6_cleaner { int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; + bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES @@ -564,6 +565,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; @@ -573,6 +575,13 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_head *head; int res = 0; + if (cb->strict_check) { + int err = ip_valid_fib_dump_req(nlh, cb->extack); + + if (err < 0) + return err; + } + s_h = cb->args[0]; s_e = cb->args[1]; @@ -1948,6 +1957,7 @@ static int fib6_clean_node(struct fib6_walker *w) struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, + .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && @@ -1999,7 +2009,7 @@ static int fib6_clean_node(struct fib6_walker *w) static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), - int sernum, void *arg) + int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; @@ -2011,13 +2021,14 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.sernum = sernum; c.arg = arg; c.net = net; + c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), - int sernum, void *arg) + int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; @@ -2029,7 +2040,7 @@ static void __fib6_clean_all(struct net *net, hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, sernum, arg); + func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } @@ -2039,14 +2050,21 @@ static void __fib6_clean_all(struct net *net, void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { - __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); + __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); +} + +void fib6_clean_all_skip_notify(struct net *net, + int (*func)(struct fib6_info *, void *), + void *arg) +{ + __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); - __fib6_clean_all(net, NULL, new_sernum, NULL); + __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 6f07b8380425..d7563ef76518 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2457,6 +2457,15 @@ errout: static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; + + if (cb->strict_check) { + int err = ip_valid_fib_dump_req(nlh, cb->extack); + + if (err < 0) + return err; + } + return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, _ip6mr_fill_mroute, &mfc_unres_lock); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 51863ada15a4..a25cfdd47c89 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1784,6 +1784,8 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&nd_tbl, dev); + if (!netif_carrier_ok(dev)) + neigh_carrier_down(&nd_tbl, dev); break; case NETDEV_DOWN: neigh_ifdown(&nd_tbl, dev); diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c index 8b147440fbdc..af737b47b9b5 100644 --- a/net/ipv6/netfilter/ip6t_ipv6header.c +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -65,7 +65,10 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) } hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); - BUG_ON(hp == NULL); + if (!hp) { + par->hotdrop = true; + return false; + } /* Calculate the header length */ if (nexthdr == NEXTHDR_FRAGMENT) diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 2c99b94eeca3..21bf6bf04323 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -137,7 +137,10 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) sizeof(_addr), &_addr); - BUG_ON(ap == NULL); + if (ap == NULL) { + par->hotdrop = true; + return false; + } if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { pr_debug("i=%d temp=%d;\n", i, temp); @@ -166,7 +169,10 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) + temp * sizeof(_addr), sizeof(_addr), &_addr); - BUG_ON(ap == NULL); + if (ap == NULL) { + par->hotdrop = true; + return false; + } if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) break; diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index e6eb7cf9b54f..3e4bf2286abe 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -87,18 +87,30 @@ static struct notifier_block masq_dev_notifier = { struct masq_dev_work { struct work_struct work; struct net *net; + struct in6_addr addr; int ifindex; }; +static int inet_cmp(struct nf_conn *ct, void *work) +{ + struct masq_dev_work *w = (struct masq_dev_work *)work; + struct nf_conntrack_tuple *tuple; + + if (!device_cmp(ct, (void *)(long)w->ifindex)) + return 0; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6); +} + static void iterate_cleanup_work(struct work_struct *work) { struct masq_dev_work *w; - long index; w = container_of(work, struct masq_dev_work, work); - index = w->ifindex; - nf_ct_iterate_cleanup_net(w->net, device_cmp, (void *)index, 0, 0); + nf_ct_iterate_cleanup_net(w->net, inet_cmp, (void *)w, 0, 0); put_net(w->net); kfree(w); @@ -147,6 +159,7 @@ static int masq_inet_event(struct notifier_block *this, INIT_WORK(&w->work, iterate_cleanup_work); w->ifindex = dev->ifindex; w->net = net; + w->addr = ifa->addr; schedule_work(&w->work); return NOTIFY_DONE; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 74d97addf1af..f4e08b0689a8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4026,8 +4026,12 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event) .event = event, }, }; + struct net *net = dev_net(dev); - fib6_clean_all(dev_net(dev), fib6_ifdown, &arg); + if (net->ipv6.sysctl.skip_notify_on_dev_down) + fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); + else + fib6_clean_all(net, fib6_ifdown, &arg); } void rt6_disable_ip(struct net_device *dev, unsigned long event) @@ -4117,7 +4121,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, - NULL); + extack); if (err < 0) goto errout; @@ -5031,7 +5035,10 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, return 0; } -struct ctl_table ipv6_route_table_template[] = { +static int zero; +static int one = 1; + +static struct ctl_table ipv6_route_table_template[] = { { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, @@ -5102,6 +5109,15 @@ struct ctl_table ipv6_route_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, + { + .procname = "skip_notify_on_dev_down", + .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, { } }; @@ -5125,6 +5141,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; + table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) @@ -5189,6 +5206,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + net->ipv6.sysctl.skip_notify_on_dev_down = 0; net->ipv6.ip6_rt_gc_expire = 30*HZ; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 8fbe6cdbe255..5fe274c47c41 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1223,7 +1223,7 @@ static int mpls_netconf_get_devconf(struct sk_buff *in_skb, int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_mpls_policy, NULL); + devconf_mpls_policy, extack); if (err < 0) goto errout; @@ -1263,6 +1263,7 @@ errout: static int mpls_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct hlist_head *head; struct net_device *dev; @@ -1270,6 +1271,21 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, int idx, s_idx; int h, s_h; + if (cb->strict_check) { + struct netlink_ext_ack *extack = cb->extack; + struct netconfmsg *ncm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*ncm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request"); + return -EINVAL; + } + } + s_h = cb->args[0]; s_idx = idx = cb->args[1]; @@ -1286,7 +1302,7 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, goto cont; if (mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, + nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL) < 0) { @@ -2015,8 +2031,43 @@ nla_put_failure: return -EMSGSIZE; } +#if IS_ENABLED(CONFIG_INET) +static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return ip_valid_fib_dump_req(nlh, extack); +} +#else +static int mpls_valid_fib_dump_req(const struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct rtmsg *rtm; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request"); + return -EINVAL; + } + + rtm = nlmsg_data(nlh); + if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || + rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || + rtm->rtm_type || rtm->rtm_flags) { + NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for FIB dump request"); + return -EINVAL; + } + + if (nlmsg_attrlen(nlh, sizeof(*rtm))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in FIB dump request"); + return -EINVAL; + } + + return 0; +} +#endif + static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) { + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct mpls_route __rcu **platform_label; size_t platform_labels; @@ -2024,6 +2075,13 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) ASSERT_RTNL(); + if (cb->strict_check) { + int err = mpls_valid_fib_dump_req(nlh, cb->extack); + + if (err < 0) + return err; + } + index = cb->args[0]; if (index < MPLS_LABEL_FIRST_UNRESERVED) index = MPLS_LABEL_FIRST_UNRESERVED; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f61c306de1d0..2ab870ef233a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -625,6 +625,13 @@ config NFT_FIB_INET The lookup will be delegated to the IPv4 or IPv6 FIB depending on the protocol of the packet. +config NFT_XFRM + tristate "Netfilter nf_tables xfrm/IPSec security association matching" + depends on XFRM + help + This option adds an expression that you can use to extract properties + of a packets security association. + config NFT_SOCKET tristate "Netfilter nf_tables socket match support" depends on IPV6 || IPV6=n diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 16895e045b66..4ddf3ef51ece 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -113,6 +113,7 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NFT_SOCKET) += nft_socket.o obj-$(CONFIG_NFT_OSF) += nft_osf.o obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o +obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 62eefea48973..83395bf6dc35 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3234,7 +3234,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, /* Try to find the service for which to dump destinations */ if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, - ip_vs_cmd_policy, NULL)) + ip_vs_cmd_policy, cb->extack)) goto out_err; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a676d5f76bdc..ca1168d67fac 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -379,7 +379,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, return false; } - l4proto = __nf_ct_l4proto_find(l3num, protonum); + l4proto = __nf_ct_l4proto_find(protonum); ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, l4proto); @@ -539,7 +539,7 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_ct_tmpl_free(ct); return; } - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->destroy) l4proto->destroy(ct); @@ -840,7 +840,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, enum ip_conntrack_info oldinfo; struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->allow_clash && !nf_ct_is_dying(ct) && atomic_inc_not_zero(&ct->ct_general.use)) { @@ -1109,7 +1109,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) return true; @@ -1370,12 +1370,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; - if (!l4proto->new(ct, skb, dataoff)) { - nf_conntrack_free(ct); - pr_debug("can't track with proto module\n"); - return NULL; - } - if (timeout_ext) nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), GFP_ATOMIC); @@ -1436,12 +1430,12 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, /* On success, returns 0, sets skb->_nfct | ctinfo */ static int -resolve_normal_ct(struct net *net, struct nf_conn *tmpl, +resolve_normal_ct(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, - u_int16_t l3num, u_int8_t protonum, - const struct nf_conntrack_l4proto *l4proto) + const struct nf_conntrack_l4proto *l4proto, + const struct nf_hook_state *state) { const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; @@ -1452,17 +1446,18 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), - dataoff, l3num, protonum, net, &tuple, l4proto)) { + dataoff, state->pf, protonum, state->net, + &tuple, l4proto)) { pr_debug("Can't get tuple\n"); return 0; } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple, net); - h = __nf_conntrack_find_get(net, zone, &tuple, hash); + hash = hash_conntrack_raw(&tuple, state->net); + h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); if (!h) { - h = init_conntrack(net, tmpl, &tuple, l4proto, + h = init_conntrack(state->net, tmpl, &tuple, l4proto, skb, dataoff, hash); if (!h) return 0; @@ -1491,13 +1486,45 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, return 0; } +/* + * icmp packets need special treatment to handle error messages that are + * related to a connection. + * + * Callers need to check if skb has a conntrack assigned when this + * helper returns; in such case skb belongs to an already known connection. + */ +static unsigned int __cold +nf_conntrack_handle_icmp(struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + u8 protonum, + const struct nf_hook_state *state) +{ + int ret; + + if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) + ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); +#if IS_ENABLED(CONFIG_IPV6) + else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) + ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); +#endif + else + return NF_ACCEPT; + + if (ret <= 0) { + NF_CT_STAT_INC_ATOMIC(state->net, error); + NF_CT_STAT_INC_ATOMIC(state->net, invalid); + } + + return ret; +} + unsigned int -nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, - struct sk_buff *skb) +nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conntrack_l4proto *l4proto; - struct nf_conn *ct, *tmpl; enum ip_conntrack_info ctinfo; + struct nf_conn *ct, *tmpl; u_int8_t protonum; int dataoff, ret; @@ -1506,32 +1533,28 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, /* Previously seen (loopback or untracked)? Ignore. */ if ((tmpl && !nf_ct_is_template(tmpl)) || ctinfo == IP_CT_UNTRACKED) { - NF_CT_STAT_INC_ATOMIC(net, ignore); + NF_CT_STAT_INC_ATOMIC(state->net, ignore); return NF_ACCEPT; } skb->_nfct = 0; } /* rcu_read_lock()ed by nf_hook_thresh */ - dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum); + dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); if (dataoff <= 0) { pr_debug("not prepared to track yet or error occurred\n"); - NF_CT_STAT_INC_ATOMIC(net, error); - NF_CT_STAT_INC_ATOMIC(net, invalid); + NF_CT_STAT_INC_ATOMIC(state->net, error); + NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } - l4proto = __nf_ct_l4proto_find(pf, protonum); + l4proto = __nf_ct_l4proto_find(protonum); - /* It may be an special packet, error, unclean... - * inverse of the return code tells to the netfilter - * core what to do with the packet. */ - if (l4proto->error != NULL) { - ret = l4proto->error(net, tmpl, skb, dataoff, pf, hooknum); + if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { + ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, + protonum, state); if (ret <= 0) { - NF_CT_STAT_INC_ATOMIC(net, error); - NF_CT_STAT_INC_ATOMIC(net, invalid); ret = -ret; goto out; } @@ -1540,10 +1563,11 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, goto out; } repeat: - ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto); + ret = resolve_normal_ct(tmpl, skb, dataoff, + protonum, l4proto, state); if (ret < 0) { /* Too stressed to deal. */ - NF_CT_STAT_INC_ATOMIC(net, drop); + NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = NF_DROP; goto out; } @@ -1551,21 +1575,21 @@ repeat: ct = nf_ct_get(skb, &ctinfo); if (!ct) { /* Not valid part of a connection */ - NF_CT_STAT_INC_ATOMIC(net, invalid); + NF_CT_STAT_INC_ATOMIC(state->net, invalid); ret = NF_ACCEPT; goto out; } - ret = l4proto->packet(ct, skb, dataoff, ctinfo); + ret = l4proto->packet(ct, skb, dataoff, ctinfo, state); if (ret <= 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_conntrack_put(&ct->ct_general); skb->_nfct = 0; - NF_CT_STAT_INC_ATOMIC(net, invalid); + NF_CT_STAT_INC_ATOMIC(state->net, invalid); if (ret == -NF_DROP) - NF_CT_STAT_INC_ATOMIC(net, drop); + NF_CT_STAT_INC_ATOMIC(state->net, drop); /* Special case: TCP tracker reports an attempt to reopen a * closed/aborted connection. We have to go back and create a * fresh conntrack. @@ -1594,8 +1618,7 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, rcu_read_lock(); ret = nf_ct_invert_tuple(inverse, orig, - __nf_ct_l4proto_find(orig->src.l3num, - orig->dst.protonum)); + __nf_ct_l4proto_find(orig->dst.protonum)); rcu_read_unlock(); return ret; } @@ -1752,7 +1775,7 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb) if (dataoff <= 0) return -1; - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, l4num, net, &tuple, l4proto)) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 27b84231db10..3034038bfdf0 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -610,8 +610,7 @@ static int exp_seq_show(struct seq_file *s, void *v) expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - __nf_ct_l4proto_find(expect->tuple.src.l3num, - expect->tuple.dst.protonum)); + __nf_ct_l4proto_find(expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { seq_puts(s, "PERMANENT"); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 036207ecaf16..4ae8e528943a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -135,8 +135,7 @@ static int ctnetlink_dump_tuples(struct sk_buff *skb, ret = ctnetlink_dump_tuples_ip(skb, tuple); if (ret >= 0) { - l4proto = __nf_ct_l4proto_find(tuple->src.l3num, - tuple->dst.protonum); + l4proto = __nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); } rcu_read_unlock(); @@ -184,7 +183,7 @@ static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) struct nlattr *nest_proto; int ret; - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->to_nlattr) return 0; @@ -592,7 +591,7 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct) len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1); len *= 3u; /* ORIG, REPLY, MASTER */ - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); len += l4proto->nlattr_size; if (l4proto->nlattr_tuple_size) { len4 = l4proto->nlattr_tuple_size(); @@ -821,6 +820,7 @@ static int ctnetlink_done(struct netlink_callback *cb) } struct ctnetlink_filter { + u8 family; struct { u_int32_t val; u_int32_t mask; @@ -828,31 +828,39 @@ struct ctnetlink_filter { }; static struct ctnetlink_filter * -ctnetlink_alloc_filter(const struct nlattr * const cda[]) +ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { -#ifdef CONFIG_NF_CONNTRACK_MARK struct ctnetlink_filter *filter; +#ifndef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) + return ERR_PTR(-EOPNOTSUPP); +#endif + filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) return ERR_PTR(-ENOMEM); - filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->family = family; - return filter; -#else - return ERR_PTR(-EOPNOTSUPP); +#ifdef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); + filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + } #endif + return filter; } static int ctnetlink_start(struct netlink_callback *cb) { const struct nlattr * const *cda = cb->data; struct ctnetlink_filter *filter = NULL; + struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + u8 family = nfmsg->nfgen_family; - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { - filter = ctnetlink_alloc_filter(cda); + if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); } @@ -866,13 +874,24 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) struct ctnetlink_filter *filter = data; if (filter == NULL) - return 1; + goto out; + + /* Match entries of a given L3 protocol number. + * If it is not specified, ie. l3proto == 0, + * then match everything. + */ + if (filter->family && nf_ct_l3num(ct) != filter->family) + goto ignore_entry; #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) == filter->mark.val) - return 1; + if ((ct->mark & filter->mark.mask) != filter->mark.val) + goto ignore_entry; #endif +out: + return 1; + +ignore_entry: return 0; } @@ -883,8 +902,6 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - u_int8_t l3proto = nfmsg->nfgen_family; struct nf_conn *nf_ct_evict[8]; int res, i; spinlock_t *lockp; @@ -923,11 +940,6 @@ restart: if (!net_eq(net, nf_ct_net(ct))) continue; - /* Dump entries of a given L3 protocol number. - * If it is not specified, ie. l3proto == 0, - * then dump everything. */ - if (l3proto && nf_ct_l3num(ct) != l3proto) - continue; if (cb->args[1]) { if (ct != last) continue; @@ -1048,7 +1060,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); - l4proto = __nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); + l4proto = __nf_ct_l4proto_find(tuple->dst.protonum); if (likely(l4proto->nlattr_to_tuple)) { ret = nla_validate_nested(attr, CTA_PROTO_MAX, @@ -1213,12 +1225,12 @@ static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) static int ctnetlink_flush_conntrack(struct net *net, const struct nlattr * const cda[], - u32 portid, int report) + u32 portid, int report, u8 family) { struct ctnetlink_filter *filter = NULL; - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { - filter = ctnetlink_alloc_filter(cda); + if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); } @@ -1257,7 +1269,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, else { return ctnetlink_flush_conntrack(net, cda, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(nlh), u3); } if (err < 0) @@ -1696,7 +1708,7 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct, return err; rcu_read_lock(); - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); if (l4proto->from_nlattr) err = l4proto->from_nlattr(tb, ct); rcu_read_unlock(); @@ -2656,8 +2668,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, rcu_read_lock(); ret = ctnetlink_dump_tuples_ip(skb, &m); if (ret >= 0) { - l4proto = __nf_ct_l4proto_find(tuple->src.l3num, - tuple->dst.protonum); + l4proto = __nf_ct_l4proto_find(tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, &m, l4proto); } rcu_read_unlock(); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 51c5d7eec0a3..40643af7137e 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -43,7 +43,7 @@ extern unsigned int nf_conntrack_net_id; -static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly; +static struct nf_conntrack_l4proto __rcu *nf_ct_protos[MAX_NF_CT_PROTO + 1] __read_mostly; static DEFINE_MUTEX(nf_ct_proto_mutex); @@ -124,23 +124,21 @@ void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_ct_l4proto_log_invalid); #endif -const struct nf_conntrack_l4proto * -__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) +const struct nf_conntrack_l4proto *__nf_ct_l4proto_find(u8 l4proto) { - if (unlikely(l3proto >= NFPROTO_NUMPROTO || nf_ct_protos[l3proto] == NULL)) + if (unlikely(l4proto >= ARRAY_SIZE(nf_ct_protos))) return &nf_conntrack_l4proto_generic; - return rcu_dereference(nf_ct_protos[l3proto][l4proto]); + return rcu_dereference(nf_ct_protos[l4proto]); } EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); -const struct nf_conntrack_l4proto * -nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num) +const struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u8 l4num) { const struct nf_conntrack_l4proto *p; rcu_read_lock(); - p = __nf_ct_l4proto_find(l3num, l4num); + p = __nf_ct_l4proto_find(l4num); if (!try_module_get(p->me)) p = &nf_conntrack_l4proto_generic; rcu_read_unlock(); @@ -159,8 +157,7 @@ static int kill_l4proto(struct nf_conn *i, void *data) { const struct nf_conntrack_l4proto *l4proto; l4proto = data; - return nf_ct_protonum(i) == l4proto->l4proto && - nf_ct_l3num(i) == l4proto->l3proto; + return nf_ct_protonum(i) == l4proto->l4proto; } static struct nf_proto_net *nf_ct_l4proto_net(struct net *net, @@ -219,48 +216,20 @@ int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto) { int ret = 0; - if (l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)) - return -EBUSY; - if ((l4proto->to_nlattr && l4proto->nlattr_size == 0) || (l4proto->tuple_to_nlattr && !l4proto->nlattr_tuple_size)) return -EINVAL; mutex_lock(&nf_ct_proto_mutex); - if (!nf_ct_protos[l4proto->l3proto]) { - /* l3proto may be loaded latter. */ - struct nf_conntrack_l4proto __rcu **proto_array; - int i; - - proto_array = - kmalloc_array(MAX_NF_CT_PROTO, - sizeof(struct nf_conntrack_l4proto *), - GFP_KERNEL); - if (proto_array == NULL) { - ret = -ENOMEM; - goto out_unlock; - } - - for (i = 0; i < MAX_NF_CT_PROTO; i++) - RCU_INIT_POINTER(proto_array[i], - &nf_conntrack_l4proto_generic); - - /* Before making proto_array visible to lockless readers, - * we must make sure its content is committed to memory. - */ - smp_wmb(); - - nf_ct_protos[l4proto->l3proto] = proto_array; - } else if (rcu_dereference_protected( - nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + if (rcu_dereference_protected( + nf_ct_protos[l4proto->l4proto], lockdep_is_held(&nf_ct_proto_mutex) ) != &nf_conntrack_l4proto_generic) { ret = -EBUSY; goto out_unlock; } - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], - l4proto); + rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], l4proto); out_unlock: mutex_unlock(&nf_ct_proto_mutex); return ret; @@ -274,7 +243,7 @@ int nf_ct_l4proto_pernet_register_one(struct net *net, struct nf_proto_net *pn = NULL; if (l4proto->init_net) { - ret = l4proto->init_net(net, l4proto->l3proto); + ret = l4proto->init_net(net); if (ret < 0) goto out; } @@ -296,13 +265,13 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register_one); static void __nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *l4proto) { - BUG_ON(l4proto->l3proto >= ARRAY_SIZE(nf_ct_protos)); + BUG_ON(l4proto->l4proto >= ARRAY_SIZE(nf_ct_protos)); BUG_ON(rcu_dereference_protected( - nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + nf_ct_protos[l4proto->l4proto], lockdep_is_held(&nf_ct_proto_mutex) ) != l4proto); - rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + rcu_assign_pointer(nf_ct_protos[l4proto->l4proto], &nf_conntrack_l4proto_generic); } @@ -352,7 +321,7 @@ static int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], unsigned int num_proto) { - int ret = -EINVAL, ver; + int ret = -EINVAL; unsigned int i; for (i = 0; i < num_proto; i++) { @@ -361,9 +330,8 @@ nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[], break; } if (i != num_proto) { - ver = l4proto[i]->l3proto == PF_INET6 ? 6 : 4; - pr_err("nf_conntrack_ipv%d: can't register l4 %d proto.\n", - ver, l4proto[i]->l4proto); + pr_err("nf_conntrack: can't register l4 %d proto.\n", + l4proto[i]->l4proto); nf_ct_l4proto_unregister(l4proto, i); } return ret; @@ -382,9 +350,8 @@ int nf_ct_l4proto_pernet_register(struct net *net, break; } if (i != num_proto) { - pr_err("nf_conntrack_proto_%d %d: pernet registration failed\n", - l4proto[i]->l4proto, - l4proto[i]->l3proto == PF_INET6 ? 6 : 4); + pr_err("nf_conntrack %d: pernet registration failed\n", + l4proto[i]->l4proto); nf_ct_l4proto_pernet_unregister(net, l4proto, i); } return ret; @@ -455,7 +422,7 @@ static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); + return nf_conntrack_in(skb, state); } static unsigned int ipv4_conntrack_local(void *priv, @@ -477,7 +444,7 @@ static unsigned int ipv4_conntrack_local(void *priv, return NF_ACCEPT; } - return nf_conntrack_in(state->net, PF_INET, state->hook, skb); + return nf_conntrack_in(skb, state); } /* Connection tracking may drop packets, but never alters them, so @@ -690,14 +657,14 @@ static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); + return nf_conntrack_in(skb, state); } static unsigned int ipv6_conntrack_local(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return nf_conntrack_in(state->net, PF_INET6, state->hook, skb); + return nf_conntrack_in(skb, state); } static unsigned int ipv6_helper(void *priv, @@ -911,37 +878,26 @@ void nf_ct_netns_put(struct net *net, uint8_t nfproto) EXPORT_SYMBOL_GPL(nf_ct_netns_put); static const struct nf_conntrack_l4proto * const builtin_l4proto[] = { - &nf_conntrack_l4proto_tcp4, - &nf_conntrack_l4proto_udp4, + &nf_conntrack_l4proto_tcp, + &nf_conntrack_l4proto_udp, &nf_conntrack_l4proto_icmp, #ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp4, + &nf_conntrack_l4proto_dccp, #endif #ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp4, + &nf_conntrack_l4proto_sctp, #endif #ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite4, + &nf_conntrack_l4proto_udplite, #endif #if IS_ENABLED(CONFIG_IPV6) - &nf_conntrack_l4proto_tcp6, - &nf_conntrack_l4proto_udp6, &nf_conntrack_l4proto_icmpv6, -#ifdef CONFIG_NF_CT_PROTO_DCCP - &nf_conntrack_l4proto_dccp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_SCTP - &nf_conntrack_l4proto_sctp6, -#endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE - &nf_conntrack_l4proto_udplite6, -#endif #endif /* CONFIG_IPV6 */ }; int nf_conntrack_proto_init(void) { - int ret = 0; + int ret = 0, i; ret = nf_register_sockopt(&so_getorigdst); if (ret < 0) @@ -952,6 +908,11 @@ int nf_conntrack_proto_init(void) if (ret < 0) goto cleanup_sockopt; #endif + + for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) + RCU_INIT_POINTER(nf_ct_protos[i], + &nf_conntrack_l4proto_generic); + ret = nf_ct_l4proto_register(builtin_l4proto, ARRAY_SIZE(builtin_l4proto)); if (ret < 0) @@ -969,17 +930,10 @@ cleanup_sockopt: void nf_conntrack_proto_fini(void) { - unsigned int i; - nf_unregister_sockopt(&so_getorigdst); #if IS_ENABLED(CONFIG_IPV6) nf_unregister_sockopt(&so_getorigdst6); #endif - /* No need to call nf_ct_l4proto_unregister(), the register - * tables are free'd here anyway. - */ - for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++) - kfree(nf_ct_protos[i]); } int nf_conntrack_proto_pernet_init(struct net *net) @@ -988,8 +942,7 @@ int nf_conntrack_proto_pernet_init(struct net *net) struct nf_proto_net *pn = nf_ct_l4proto_net(net, &nf_conntrack_l4proto_generic); - err = nf_conntrack_l4proto_generic.init_net(net, - nf_conntrack_l4proto_generic.l3proto); + err = nf_conntrack_l4proto_generic.init_net(net); if (err < 0) return err; err = nf_ct_l4proto_register_sysctl(net, diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index f3f91ed2c21a..171e9e122e5f 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -389,18 +389,15 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net) return &net->ct.nf_ct_proto.dccp; } -static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) +static noinline bool +dccp_new(struct nf_conn *ct, const struct sk_buff *skb, + const struct dccp_hdr *dh) { struct net *net = nf_ct_net(ct); struct nf_dccp_net *dn; - struct dccp_hdr _dh, *dh; const char *msg; u_int8_t state; - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); - BUG_ON(dh == NULL); - state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; switch (state) { default: @@ -438,8 +435,51 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh) ntohl(dhack->dccph_ack_nr_low); } -static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff, enum ip_conntrack_info ctinfo) +static bool dccp_error(const struct dccp_hdr *dh, + struct sk_buff *skb, unsigned int dataoff, + const struct nf_hook_state *state) +{ + unsigned int dccp_len = skb->len - dataoff; + unsigned int cscov; + const char *msg; + + if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || + dh->dccph_doff * 4 > dccp_len) { + msg = "nf_ct_dccp: truncated/malformed packet "; + goto out_invalid; + } + + cscov = dccp_len; + if (dh->dccph_cscov) { + cscov = (dh->dccph_cscov - 1) * 4; + if (cscov > dccp_len) { + msg = "nf_ct_dccp: bad checksum coverage "; + goto out_invalid; + } + } + + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + nf_checksum_partial(skb, state->hook, dataoff, cscov, + IPPROTO_DCCP, state->pf)) { + msg = "nf_ct_dccp: bad checksum "; + goto out_invalid; + } + + if (dh->dccph_type >= DCCP_PKT_INVALID) { + msg = "nf_ct_dccp: reserved packet type "; + goto out_invalid; + } + return false; +out_invalid: + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_DCCP, "%s", msg); + return true; +} + +static int dccp_packet(struct nf_conn *ct, struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; @@ -448,8 +488,15 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int *timeouts; dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); - BUG_ON(dh == NULL); + if (!dh) + return NF_DROP; + + if (dccp_error(dh, skb, dataoff, state)) + return -NF_ACCEPT; + type = dh->dccph_type; + if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh)) + return -NF_ACCEPT; if (type == DCCP_PKT_RESET && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { @@ -527,55 +574,6 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, return NF_ACCEPT; } -static int dccp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u_int8_t pf, unsigned int hooknum) -{ - struct dccp_hdr _dh, *dh; - unsigned int dccp_len = skb->len - dataoff; - unsigned int cscov; - const char *msg; - - dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh); - if (dh == NULL) { - msg = "nf_ct_dccp: short packet "; - goto out_invalid; - } - - if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || - dh->dccph_doff * 4 > dccp_len) { - msg = "nf_ct_dccp: truncated/malformed packet "; - goto out_invalid; - } - - cscov = dccp_len; - if (dh->dccph_cscov) { - cscov = (dh->dccph_cscov - 1) * 4; - if (cscov > dccp_len) { - msg = "nf_ct_dccp: bad checksum coverage "; - goto out_invalid; - } - } - - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP, - pf)) { - msg = "nf_ct_dccp: bad checksum "; - goto out_invalid; - } - - if (dh->dccph_type >= DCCP_PKT_INVALID) { - msg = "nf_ct_dccp: reserved packet type "; - goto out_invalid; - } - - return NF_ACCEPT; - -out_invalid: - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_DCCP, "%s", msg); - return -NF_ACCEPT; -} - static bool dccp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.dccp.state) { @@ -814,7 +812,7 @@ static int dccp_kmemdup_sysctl_table(struct net *net, struct nf_proto_net *pn, return 0; } -static int dccp_init_net(struct net *net, u_int16_t proto) +static int dccp_init_net(struct net *net) { struct nf_dccp_net *dn = dccp_pernet(net); struct nf_proto_net *pn = &dn->pn; @@ -844,45 +842,9 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.dccp.pn; } -const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = { - .l3proto = AF_INET, - .l4proto = IPPROTO_DCCP, - .new = dccp_new, - .packet = dccp_packet, - .error = dccp_error, - .can_early_drop = dccp_can_early_drop, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = dccp_print_conntrack, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = DCCP_NLATTR_SIZE, - .to_nlattr = dccp_to_nlattr, - .from_nlattr = nlattr_to_dccp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = dccp_timeout_nlattr_to_obj, - .obj_to_nlattr = dccp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_DCCP_MAX, - .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, - .nla_policy = dccp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = dccp_init_net, - .get_net_proto = dccp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4); - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { - .l3proto = AF_INET6, +const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { .l4proto = IPPROTO_DCCP, - .new = dccp_new, .packet = dccp_packet, - .error = dccp_error, .can_early_drop = dccp_can_early_drop, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = dccp_print_conntrack, @@ -908,4 +870,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = { .init_net = dccp_init_net, .get_net_proto = dccp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 1df3244ecd07..e10e867e0b55 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -44,12 +44,19 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb, /* Returns verdict for packet, or -1 for invalid. */ static int generic_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { const unsigned int *timeout = nf_ct_timeout_lookup(ct); + if (!nf_generic_should_process(nf_ct_protonum(ct))) { + pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n", + nf_ct_protonum(ct)); + return -NF_ACCEPT; + } + if (!timeout) timeout = &generic_pernet(nf_ct_net(ct))->timeout; @@ -57,19 +64,6 @@ static int generic_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - bool ret; - - ret = nf_generic_should_process(nf_ct_protonum(ct)); - if (!ret) - pr_warn_once("conntrack: generic helper won't handle protocol %d. Please consider loading the specific helper module.\n", - nf_ct_protonum(ct)); - return ret; -} - #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> @@ -142,7 +136,7 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int generic_init_net(struct net *net, u_int16_t proto) +static int generic_init_net(struct net *net) { struct nf_generic_net *gn = generic_pernet(net); struct nf_proto_net *pn = &gn->pn; @@ -159,11 +153,9 @@ static struct nf_proto_net *generic_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = { - .l3proto = PF_UNSPEC, .l4proto = 255, .pkt_to_tuple = generic_pkt_to_tuple, .packet = generic_packet, - .new = generic_new, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = generic_timeout_nlattr_to_obj, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 650eb4fba2c5..9b48dc8b4b88 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -233,10 +233,26 @@ static unsigned int *gre_get_timeouts(struct net *net) /* Returns verdict for packet, and may modify conntrack */ static int gre_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { + if (state->pf != NFPROTO_IPV4) + return -NF_ACCEPT; + + if (!nf_ct_is_confirmed(ct)) { + unsigned int *timeouts = nf_ct_timeout_lookup(ct); + + if (!timeouts) + timeouts = gre_get_timeouts(nf_ct_net(ct)); + + /* initialize to sane value. Ideally a conntrack helper + * (e.g. in case of pptp) is increasing them */ + ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; + ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; + } + /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ if (ct->status & IPS_SEEN_REPLY) { @@ -252,26 +268,6 @@ static int gre_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - unsigned int *timeouts = nf_ct_timeout_lookup(ct); - - if (!timeouts) - timeouts = gre_get_timeouts(nf_ct_net(ct)); - - pr_debug(": "); - nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - - /* initialize to sane value. Ideally a conntrack helper - * (e.g. in case of pptp) is increasing them */ - ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; - ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; - - return true; -} - /* Called when a conntrack entry has already been removed from the hashes * and is about to be deleted from memory */ static void gre_destroy(struct nf_conn *ct) @@ -336,7 +332,7 @@ gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -static int gre_init_net(struct net *net, u_int16_t proto) +static int gre_init_net(struct net *net) { struct netns_proto_gre *net_gre = gre_pernet(net); int i; @@ -351,14 +347,12 @@ static int gre_init_net(struct net *net, u_int16_t proto) /* protocol helper struct */ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = { - .l3proto = AF_INET, .l4proto = IPPROTO_GRE, .pkt_to_tuple = gre_pkt_to_tuple, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif .packet = gre_packet, - .new = gre_new, .destroy = gre_destroy, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index 43c7e1a217b9..3598520bd19b 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -72,34 +72,17 @@ static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, return true; } -static unsigned int *icmp_get_timeouts(struct net *net) -{ - return &icmp_pernet(net)->timeout; -} - /* Returns verdict for packet, or -1 for invalid. */ static int icmp_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ unsigned int *timeout = nf_ct_timeout_lookup(ct); - - if (!timeout) - timeout = icmp_get_timeouts(nf_ct_net(ct)); - - nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); - - return NF_ACCEPT; -} - -/* Called when a new connection for this protocol found. */ -static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, @@ -107,21 +90,29 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, [ICMP_ADDRESS] = 1 }; + if (state->pf != NFPROTO_IPV4) + return -NF_ACCEPT; + if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); - return false; + return -NF_ACCEPT; } - return true; + + if (!timeout) + timeout = &icmp_pernet(nf_ct_net(ct))->timeout; + + nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); + return NF_ACCEPT; } /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int -icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int hooknum) +icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb, + const struct nf_hook_state *state) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; @@ -137,13 +128,13 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb) + ip_hdrlen(skb) + sizeof(struct icmphdr), - PF_INET, net, &origtuple)) { + PF_INET, state->net, &origtuple)) { pr_debug("icmp_error_message: failed to get tuple\n"); return -NF_ACCEPT; } /* rcu_read_lock()ed by nf_hook_thresh */ - innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); + innerproto = __nf_ct_l4proto_find(origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ @@ -154,7 +145,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(net, zone, &innertuple); + h = nf_conntrack_find_get(state->net, zone, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; @@ -168,17 +159,18 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return NF_ACCEPT; } -static void icmp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) +static void icmp_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) { - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_ICMP, "%s", msg); } /* Small and modified version of icmp_rcv */ -static int -icmp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) +int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, + const struct nf_hook_state *state) { const struct icmphdr *icmph; struct icmphdr _ih; @@ -186,14 +178,15 @@ icmp_error(struct net *net, struct nf_conn *tmpl, /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { - icmp_error_log(skb, net, pf, "short packet"); + icmp_error_log(skb, state, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip_checksum(skb, hooknum, dataoff, 0)) { - icmp_error_log(skb, net, pf, "bad hw icmp checksum"); + if (state->net->ct.sysctl_checksum && + state->hook == NF_INET_PRE_ROUTING && + nf_ip_checksum(skb, state->hook, dataoff, 0)) { + icmp_error_log(skb, state, "bad hw icmp checksum"); return -NF_ACCEPT; } @@ -204,7 +197,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, * discarded. */ if (icmph->type > NR_ICMP_TYPES) { - icmp_error_log(skb, net, pf, "invalid icmp type"); + icmp_error_log(skb, state, "invalid icmp type"); return -NF_ACCEPT; } @@ -216,7 +209,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl, icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(net, tmpl, skb, hooknum); + return icmp_error_message(tmpl, skb, state); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -342,7 +335,7 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int icmp_init_net(struct net *net, u_int16_t proto) +static int icmp_init_net(struct net *net) { struct nf_icmp_net *in = icmp_pernet(net); struct nf_proto_net *pn = &in->pn; @@ -359,13 +352,10 @@ static struct nf_proto_net *icmp_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { - .l3proto = PF_INET, .l4proto = IPPROTO_ICMP, .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .packet = icmp_packet, - .new = icmp_new, - .error = icmp_error, .destroy = NULL, .me = NULL, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 97e40f77d678..378618feed5d 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -92,11 +92,31 @@ static unsigned int *icmpv6_get_timeouts(struct net *net) /* Returns verdict for packet, or -1 for invalid. */ static int icmpv6_packet(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo) + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { unsigned int *timeout = nf_ct_timeout_lookup(ct); + static const u8 valid_new[] = { + [ICMPV6_ECHO_REQUEST - 128] = 1, + [ICMPV6_NI_QUERY - 128] = 1 + }; + + if (state->pf != NFPROTO_IPV6) + return -NF_ACCEPT; + + if (!nf_ct_is_confirmed(ct)) { + int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; + + if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { + /* Can't create a new ICMPv6 `conn' with this. */ + pr_debug("icmpv6: can't create new conn with type %u\n", + type + 128); + nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); + return -NF_ACCEPT; + } + } if (!timeout) timeout = icmpv6_get_timeouts(nf_ct_net(ct)); @@ -109,26 +129,6 @@ static int icmpv6_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - static const u_int8_t valid_new[] = { - [ICMPV6_ECHO_REQUEST - 128] = 1, - [ICMPV6_NI_QUERY - 128] = 1 - }; - int type = ct->tuplehash[0].tuple.dst.u.icmp.type - 128; - - if (type < 0 || type >= sizeof(valid_new) || !valid_new[type]) { - /* Can't create a new ICMPv6 `conn' with this. */ - pr_debug("icmpv6: can't create new conn with type %u\n", - type + 128); - nf_ct_dump_tuple_ipv6(&ct->tuplehash[0].tuple); - return false; - } - return true; -} - static int icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, @@ -153,7 +153,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, } /* rcu_read_lock()ed by nf_hook_thresh */ - inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); + inproto = __nf_ct_l4proto_find(origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. */ @@ -179,16 +179,18 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return NF_ACCEPT; } -static void icmpv6_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) +static void icmpv6_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) { - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMPV6, "%s", msg); + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_ICMPV6, "%s", msg); } -static int -icmpv6_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, unsigned int dataoff, - u8 pf, unsigned int hooknum) +int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, + struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) { const struct icmp6hdr *icmp6h; struct icmp6hdr _ih; @@ -196,13 +198,14 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, icmp6h = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih); if (icmp6h == NULL) { - icmpv6_error_log(skb, net, pf, "short packet"); + icmpv6_error_log(skb, state, "short packet"); return -NF_ACCEPT; } - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { - icmpv6_error_log(skb, net, pf, "ICMPv6 checksum failed"); + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + nf_ip6_checksum(skb, state->hook, dataoff, IPPROTO_ICMPV6)) { + icmpv6_error_log(skb, state, "ICMPv6 checksum failed"); return -NF_ACCEPT; } @@ -217,7 +220,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; - return icmpv6_error_message(net, tmpl, skb, dataoff); + return icmpv6_error_message(state->net, tmpl, skb, dataoff); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -343,7 +346,7 @@ static int icmpv6_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int icmpv6_init_net(struct net *net, u_int16_t proto) +static int icmpv6_init_net(struct net *net) { struct nf_icmp_net *in = icmpv6_pernet(net); struct nf_proto_net *pn = &in->pn; @@ -360,13 +363,10 @@ static struct nf_proto_net *icmpv6_get_net_proto(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { - .l3proto = PF_INET6, .l4proto = IPPROTO_ICMPV6, .pkt_to_tuple = icmpv6_pkt_to_tuple, .invert_tuple = icmpv6_invert_tuple, .packet = icmpv6_packet, - .new = icmpv6_new, - .error = icmpv6_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index e4d738d34cd0..3d719d3eb9a3 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -273,11 +273,100 @@ static int sctp_new_state(enum ip_conntrack_dir dir, return sctp_conntracks[dir][i][cur_state]; } +/* Don't need lock here: this conntrack not in circulation yet */ +static noinline bool +sctp_new(struct nf_conn *ct, const struct sk_buff *skb, + const struct sctphdr *sh, unsigned int dataoff) +{ + enum sctp_conntrack new_state; + const struct sctp_chunkhdr *sch; + struct sctp_chunkhdr _sch; + u32 offset, count; + + memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); + new_state = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) { + new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (new_state == SCTP_CONNTRACK_NONE || + new_state == SCTP_CONNTRACK_MAX) { + pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); + return false; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + struct sctp_inithdr _inithdr, *ih; + /* Sec 8.5.1 (A) */ + if (sh->vtag) + return false; + + ih = skb_header_pointer(skb, offset + sizeof(_sch), + sizeof(_inithdr), &_inithdr); + if (!ih) + return false; + + pr_debug("Setting vtag %x for new conn\n", + ih->init_tag); + + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; + } else if (sch->type == SCTP_CID_HEARTBEAT) { + pr_debug("Setting vtag %x for secondary conntrack\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; + } else { + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + pr_debug("Setting vtag %x for new conn OOTB\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + ct->proto.sctp.state = new_state; + } + + return true; +} + +static bool sctp_error(struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) +{ + const struct sctphdr *sh; + const char *logmsg; + + if (skb->len < dataoff + sizeof(struct sctphdr)) { + logmsg = "nf_ct_sctp: short packet "; + goto out_invalid; + } + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + skb->ip_summed == CHECKSUM_NONE) { + if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) { + logmsg = "nf_ct_sctp: failed to read header "; + goto out_invalid; + } + sh = (const struct sctphdr *)(skb->data + dataoff); + if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { + logmsg = "nf_ct_sctp: bad CRC "; + goto out_invalid; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return false; +out_invalid: + nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_SCTP, "%s", logmsg); + return true; +} + /* Returns verdict for packet, or -NF_ACCEPT for invalid. */ static int sctp_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { enum sctp_conntrack new_state, old_state; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); @@ -289,6 +378,9 @@ static int sctp_packet(struct nf_conn *ct, unsigned int *timeouts; unsigned long map[256 / sizeof(unsigned long)] = { 0 }; + if (sctp_error(skb, dataoff, state)) + return -NF_ACCEPT; + sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); if (sh == NULL) goto out; @@ -296,6 +388,17 @@ static int sctp_packet(struct nf_conn *ct, if (do_basic_checks(ct, skb, dataoff, map) != 0) goto out; + if (!nf_ct_is_confirmed(ct)) { + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if (test_bit(SCTP_CID_ABORT, map) || + test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || + test_bit(SCTP_CID_COOKIE_ACK, map)) + return -NF_ACCEPT; + + if (!sctp_new(ct, skb, sh, dataoff)) + return -NF_ACCEPT; + } + /* Check the verification tag (Sec 8.5) */ if (!test_bit(SCTP_CID_INIT, map) && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && @@ -397,110 +500,6 @@ out: return -NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - enum sctp_conntrack new_state; - const struct sctphdr *sh; - struct sctphdr _sctph; - const struct sctp_chunkhdr *sch; - struct sctp_chunkhdr _sch; - u_int32_t offset, count; - unsigned long map[256 / sizeof(unsigned long)] = { 0 }; - - sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph); - if (sh == NULL) - return false; - - if (do_basic_checks(ct, skb, dataoff, map) != 0) - return false; - - /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ - if (test_bit(SCTP_CID_ABORT, map) || - test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) || - test_bit(SCTP_CID_COOKIE_ACK, map)) - return false; - - memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); - new_state = SCTP_CONNTRACK_MAX; - for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { - /* Don't need lock here: this conntrack not in circulation yet */ - new_state = sctp_new_state(IP_CT_DIR_ORIGINAL, - SCTP_CONNTRACK_NONE, sch->type); - - /* Invalid: delete conntrack */ - if (new_state == SCTP_CONNTRACK_NONE || - new_state == SCTP_CONNTRACK_MAX) { - pr_debug("nf_conntrack_sctp: invalid new deleting.\n"); - return false; - } - - /* Copy the vtag into the state info */ - if (sch->type == SCTP_CID_INIT) { - struct sctp_inithdr _inithdr, *ih; - /* Sec 8.5.1 (A) */ - if (sh->vtag) - return false; - - ih = skb_header_pointer(skb, offset + sizeof(_sch), - sizeof(_inithdr), &_inithdr); - if (!ih) - return false; - - pr_debug("Setting vtag %x for new conn\n", - ih->init_tag); - - ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; - } else if (sch->type == SCTP_CID_HEARTBEAT) { - pr_debug("Setting vtag %x for secondary conntrack\n", - sh->vtag); - ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; - } - /* If it is a shutdown ack OOTB packet, we expect a return - shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ - else { - pr_debug("Setting vtag %x for new conn OOTB\n", - sh->vtag); - ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; - } - - ct->proto.sctp.state = new_state; - } - - return true; -} - -static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb, - unsigned int dataoff, - u8 pf, unsigned int hooknum) -{ - const struct sctphdr *sh; - const char *logmsg; - - if (skb->len < dataoff + sizeof(struct sctphdr)) { - logmsg = "nf_ct_sctp: short packet "; - goto out_invalid; - } - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - skb->ip_summed == CHECKSUM_NONE) { - if (!skb_make_writable(skb, dataoff + sizeof(struct sctphdr))) { - logmsg = "nf_ct_sctp: failed to read header "; - goto out_invalid; - } - sh = (const struct sctphdr *)(skb->data + dataoff); - if (sh->checksum != sctp_compute_cksum(skb, dataoff)) { - logmsg = "nf_ct_sctp: bad CRC "; - goto out_invalid; - } - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return NF_ACCEPT; -out_invalid: - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_SCTP, "%s", logmsg); - return -NF_ACCEPT; -} - static bool sctp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.sctp.state) { @@ -735,7 +734,7 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int sctp_init_net(struct net *net, u_int16_t proto) +static int sctp_init_net(struct net *net) { struct nf_sctp_net *sn = sctp_pernet(net); struct nf_proto_net *pn = &sn->pn; @@ -760,49 +759,12 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.sctp.pn; } -const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = { - .l3proto = PF_INET, - .l4proto = IPPROTO_SCTP, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = sctp_print_conntrack, -#endif - .packet = sctp_packet, - .new = sctp_new, - .error = sctp_error, - .can_early_drop = sctp_can_early_drop, - .me = THIS_MODULE, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = SCTP_NLATTR_SIZE, - .to_nlattr = sctp_to_nlattr, - .from_nlattr = nlattr_to_sctp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = sctp_timeout_nlattr_to_obj, - .obj_to_nlattr = sctp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_SCTP_MAX, - .obj_size = sizeof(unsigned int) * SCTP_CONNTRACK_MAX, - .nla_policy = sctp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = sctp_init_net, - .get_net_proto = sctp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4); - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { - .l3proto = PF_INET6, +const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp = { .l4proto = IPPROTO_SCTP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = sctp_print_conntrack, #endif .packet = sctp_packet, - .new = sctp_new, - .error = sctp_error, .can_early_drop = sctp_can_early_drop, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -826,4 +788,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = { .init_net = sctp_init_net, .get_net_proto = sctp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 247b89784a6f..1bcf9984d45e 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -717,35 +717,26 @@ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| [TCPHDR_ACK|TCPHDR_URG] = 1, }; -static void tcp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) +static void tcp_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) { - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg); + nf_l4proto_log_invalid(skb, state->net, state->pf, IPPROTO_TCP, "%s", msg); } /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ -static int tcp_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int dataoff, - u_int8_t pf, - unsigned int hooknum) +static bool tcp_error(const struct tcphdr *th, + struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) { - const struct tcphdr *th; - struct tcphdr _tcph; unsigned int tcplen = skb->len - dataoff; - u_int8_t tcpflags; - - /* Smaller that minimal TCP header? */ - th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); - if (th == NULL) { - tcp_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } + u8 tcpflags; /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - tcp_error_log(skb, net, pf, "truncated packet"); - return -NF_ACCEPT; + tcp_error_log(skb, state, "truncated packet"); + return true; } /* Checksum invalid? Ignore. @@ -753,27 +744,101 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl, * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - tcp_error_log(skb, net, pf, "bad checksum"); - return -NF_ACCEPT; + if (state->net->ct.sysctl_checksum && + state->hook == NF_INET_PRE_ROUTING && + nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) { + tcp_error_log(skb, state, "bad checksum"); + return true; } /* Check TCP flags. */ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { - tcp_error_log(skb, net, pf, "invalid tcp flag combination"); - return -NF_ACCEPT; + tcp_error_log(skb, state, "invalid tcp flag combination"); + return true; } - return NF_ACCEPT; + return false; +} + +static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *th) +{ + enum tcp_conntrack new_state; + struct net *net = nf_ct_net(ct); + const struct nf_tcp_net *tn = tcp_pernet(net); + const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; + const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + pr_debug("nf_ct_tcp: invalid new deleting.\n"); + return false; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); + /* SYN packet */ + ct->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end; + + tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); + } else if (tn->tcp_loose == 0) { + /* Don't try to pick up connections. */ + return false; + } else { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + ct->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + dataoff, th); + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end + + ct->proto.tcp.seen[0].td_maxwin; + + /* We assume SACK and liberal window checking to handle + * window scaling */ + ct->proto.tcp.seen[0].flags = + ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | + IP_CT_TCP_FLAG_BE_LIBERAL; + } + + /* tcp_packet will set them */ + ct->proto.tcp.last_index = TCP_NONE_SET; + + pr_debug("%s: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + __func__, + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return true; } /* Returns verdict for packet, or -1 for invalid. */ static int tcp_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); @@ -786,7 +851,14 @@ static int tcp_packet(struct nf_conn *ct, unsigned long timeout; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); - BUG_ON(th == NULL); + if (th == NULL) + return -NF_ACCEPT; + + if (tcp_error(th, skb, dataoff, state)) + return -NF_ACCEPT; + + if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th)) + return -NF_ACCEPT; spin_lock_bh(&ct->lock); old_state = ct->proto.tcp.state; @@ -1067,82 +1139,6 @@ static int tcp_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - enum tcp_conntrack new_state; - const struct tcphdr *th; - struct tcphdr _tcph; - struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = tcp_pernet(net); - const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; - const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; - - th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); - BUG_ON(th == NULL); - - /* Don't need lock here: this conntrack not in circulation yet */ - new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; - - /* Invalid: delete conntrack */ - if (new_state >= TCP_CONNTRACK_MAX) { - pr_debug("nf_ct_tcp: invalid new deleting.\n"); - return false; - } - - if (new_state == TCP_CONNTRACK_SYN_SENT) { - memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); - /* SYN packet */ - ct->proto.tcp.seen[0].td_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, - dataoff, th); - ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (ct->proto.tcp.seen[0].td_maxwin == 0) - ct->proto.tcp.seen[0].td_maxwin = 1; - ct->proto.tcp.seen[0].td_maxend = - ct->proto.tcp.seen[0].td_end; - - tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); - } else if (tn->tcp_loose == 0) { - /* Don't try to pick up connections. */ - return false; - } else { - memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); - /* - * We are in the middle of a connection, - * its history is lost for us. - * Let's try to use the data from the packet. - */ - ct->proto.tcp.seen[0].td_end = - segment_seq_plus_len(ntohl(th->seq), skb->len, - dataoff, th); - ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (ct->proto.tcp.seen[0].td_maxwin == 0) - ct->proto.tcp.seen[0].td_maxwin = 1; - ct->proto.tcp.seen[0].td_maxend = - ct->proto.tcp.seen[0].td_end + - ct->proto.tcp.seen[0].td_maxwin; - - /* We assume SACK and liberal window checking to handle - * window scaling */ - ct->proto.tcp.seen[0].flags = - ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | - IP_CT_TCP_FLAG_BE_LIBERAL; - } - - /* tcp_packet will set them */ - ct->proto.tcp.last_index = TCP_NONE_SET; - - pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); - return true; -} - static bool tcp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.tcp.state) { @@ -1510,7 +1506,7 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int tcp_init_net(struct net *net, u_int16_t proto) +static int tcp_init_net(struct net *net) { struct nf_tcp_net *tn = tcp_pernet(net); struct nf_proto_net *pn = &tn->pn; @@ -1538,16 +1534,13 @@ static struct nf_proto_net *tcp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.tcp.pn; } -const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = { - .l3proto = PF_INET, .l4proto = IPPROTO_TCP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, - .new = tcp_new, - .error = tcp_error, .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, @@ -1571,39 +1564,3 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = .init_net = tcp_init_net, .get_net_proto = tcp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4); - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = -{ - .l3proto = PF_INET6, - .l4proto = IPPROTO_TCP, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = tcp_print_conntrack, -#endif - .packet = tcp_packet, - .new = tcp_new, - .error = tcp_error, - .can_early_drop = tcp_can_early_drop, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = TCP_NLATTR_SIZE, - .to_nlattr = tcp_to_nlattr, - .from_nlattr = nlattr_to_tcp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nlattr_tuple_size = tcp_nlattr_tuple_size, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = tcp_timeout_nlattr_to_obj, - .obj_to_nlattr = tcp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_TCP_MAX, - .obj_size = sizeof(unsigned int) * - TCP_CONNTRACK_TIMEOUT_MAX, - .nla_policy = tcp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = tcp_init_net, - .get_net_proto = tcp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3065fb8ef91b..a7aa70370913 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -42,14 +42,65 @@ static unsigned int *udp_get_timeouts(struct net *net) return udp_pernet(net)->timeouts; } +static void udp_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) +{ + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_UDP, "%s", msg); +} + +static bool udp_error(struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) +{ + unsigned int udplen = skb->len - dataoff; + const struct udphdr *hdr; + struct udphdr _hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (!hdr) { + udp_error_log(skb, state, "short packet"); + return true; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + udp_error_log(skb, state, "truncated/malformed packet"); + return true; + } + + /* Packet with no checksum */ + if (!hdr->check) + return false; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the checksum is assumed to be correct. + * FIXME: Source route IP option packets --RR */ + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + nf_checksum(skb, state->hook, dataoff, IPPROTO_UDP, state->pf)) { + udp_error_log(skb, state, "bad checksum"); + return true; + } + + return false; +} + /* Returns verdict for packet, and may modify conntracktype */ static int udp_packet(struct nf_conn *ct, - const struct sk_buff *skb, + struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info ctinfo) + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { unsigned int *timeouts; + if (udp_error(skb, dataoff, state)) + return -NF_ACCEPT; + timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = udp_get_timeouts(nf_ct_net(ct)); @@ -69,24 +120,18 @@ static int udp_packet(struct nf_conn *ct, return NF_ACCEPT; } -/* Called when a new connection for this protocol found. */ -static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, - unsigned int dataoff) -{ - return true; -} - #ifdef CONFIG_NF_CT_PROTO_UDPLITE -static void udplite_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) +static void udplite_error_log(const struct sk_buff *skb, + const struct nf_hook_state *state, + const char *msg) { - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDPLITE, "%s", msg); + nf_l4proto_log_invalid(skb, state->net, state->pf, + IPPROTO_UDPLITE, "%s", msg); } -static int udplite_error(struct net *net, struct nf_conn *tmpl, - struct sk_buff *skb, - unsigned int dataoff, - u8 pf, unsigned int hooknum) +static bool udplite_error(struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) { unsigned int udplen = skb->len - dataoff; const struct udphdr *hdr; @@ -96,80 +141,67 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (!hdr) { - udplite_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; + udplite_error_log(skb, state, "short packet"); + return true; } cscov = ntohs(hdr->len); if (cscov == 0) { cscov = udplen; } else if (cscov < sizeof(*hdr) || cscov > udplen) { - udplite_error_log(skb, net, pf, "invalid checksum coverage"); - return -NF_ACCEPT; + udplite_error_log(skb, state, "invalid checksum coverage"); + return true; } /* UDPLITE mandates checksums */ if (!hdr->check) { - udplite_error_log(skb, net, pf, "checksum missing"); - return -NF_ACCEPT; + udplite_error_log(skb, state, "checksum missing"); + return true; } /* Checksum invalid? Ignore. */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, - pf)) { - udplite_error_log(skb, net, pf, "bad checksum"); - return -NF_ACCEPT; + if (state->hook == NF_INET_PRE_ROUTING && + state->net->ct.sysctl_checksum && + nf_checksum_partial(skb, state->hook, dataoff, cscov, IPPROTO_UDP, + state->pf)) { + udplite_error_log(skb, state, "bad checksum"); + return true; } - return NF_ACCEPT; -} -#endif - -static void udp_error_log(const struct sk_buff *skb, struct net *net, - u8 pf, const char *msg) -{ - nf_l4proto_log_invalid(skb, net, pf, IPPROTO_UDP, "%s", msg); + return false; } -static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, - unsigned int dataoff, - u_int8_t pf, - unsigned int hooknum) +/* Returns verdict for packet, and may modify conntracktype */ +static int udplite_packet(struct nf_conn *ct, + struct sk_buff *skb, + unsigned int dataoff, + enum ip_conntrack_info ctinfo, + const struct nf_hook_state *state) { - unsigned int udplen = skb->len - dataoff; - const struct udphdr *hdr; - struct udphdr _hdr; - - /* Header is too small? */ - hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); - if (hdr == NULL) { - udp_error_log(skb, net, pf, "short packet"); - return -NF_ACCEPT; - } + unsigned int *timeouts; - /* Truncated/malformed packets */ - if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - udp_error_log(skb, net, pf, "truncated/malformed packet"); + if (udplite_error(skb, dataoff, state)) return -NF_ACCEPT; - } - /* Packet with no checksum */ - if (!hdr->check) - return NF_ACCEPT; + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = udp_get_timeouts(nf_ct_net(ct)); - /* Checksum invalid? Ignore. - * We skip checking packets on the outgoing path - * because the checksum is assumed to be correct. - * FIXME: Source route IP option packets --RR */ - if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && - nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { - udp_error_log(skb, net, pf, "bad checksum"); - return -NF_ACCEPT; + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_REPLIED]); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_ASSURED, ct); + } else { + nf_ct_refresh_acct(ct, ctinfo, skb, + timeouts[UDP_CT_UNREPLIED]); } - return NF_ACCEPT; } +#endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT @@ -258,7 +290,7 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int udp_init_net(struct net *net, u_int16_t proto) +static int udp_init_net(struct net *net) { struct nf_udp_net *un = udp_pernet(net); struct nf_proto_net *pn = &un->pn; @@ -278,72 +310,11 @@ static struct nf_proto_net *udp_get_net_proto(struct net *net) return &net->ct.nf_ct_proto.udp.pn; } -const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 = -{ - .l3proto = PF_INET, - .l4proto = IPPROTO_UDP, - .allow_clash = true, - .packet = udp_packet, - .new = udp_new, - .error = udp_error, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = udp_timeout_nlattr_to_obj, - .obj_to_nlattr = udp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_UDP_MAX, - .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, - .nla_policy = udp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = udp_init_net, - .get_net_proto = udp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp4); - -#ifdef CONFIG_NF_CT_PROTO_UDPLITE -const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 = -{ - .l3proto = PF_INET, - .l4proto = IPPROTO_UDPLITE, - .allow_clash = true, - .packet = udp_packet, - .new = udp_new, - .error = udplite_error, -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = udp_timeout_nlattr_to_obj, - .obj_to_nlattr = udp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_UDP_MAX, - .obj_size = sizeof(unsigned int) * CTA_TIMEOUT_UDP_MAX, - .nla_policy = udp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - .init_net = udp_init_net, - .get_net_proto = udp_get_net_proto, -}; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite4); -#endif - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp = { - .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, .allow_clash = true, .packet = udp_packet, - .new = udp_new, - .error = udp_error, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, @@ -362,17 +333,13 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 = .init_net = udp_init_net, .get_net_proto = udp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); #ifdef CONFIG_NF_CT_PROTO_UDPLITE -const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = +const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite = { - .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, .allow_clash = true, - .packet = udp_packet, - .new = udp_new, - .error = udplite_error, + .packet = udplite_packet, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, @@ -391,5 +358,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 = .init_net = udp_init_net, .get_net_proto = udp_get_net_proto, }; -EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6); #endif diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 13279f683da9..463d17d349c1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -292,7 +292,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (!net_eq(nf_ct_net(ct), net)) goto release; - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); WARN_ON(!l4proto); ret = -ENOSPC; @@ -720,10 +720,3 @@ static void __exit nf_conntrack_standalone_fini(void) module_init(nf_conntrack_standalone_init); module_exit(nf_conntrack_standalone_fini); - -/* Some modules need us, but don't depend directly on any symbol. - They should call this. */ -void need_conntrack(void) -{ -} -EXPORT_SYMBOL_GPL(need_conntrack); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index d8125616edc7..185c633b6872 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -120,7 +120,7 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct) if (l4num == IPPROTO_TCP) flow_offload_fixup_tcp(&ct->proto.tcp); - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num); + l4proto = __nf_ct_l4proto_find(l4num); if (!l4proto) return; @@ -233,8 +233,8 @@ flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload *flow; int dir; - tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple, - nf_flow_offload_rhash_params); + tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, + nf_flow_offload_rhash_params); if (!tuplehash) return NULL; @@ -254,20 +254,17 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *tuplehash; struct rhashtable_iter hti; struct flow_offload *flow; - int err; - - err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); - if (err) - return err; + int err = 0; + rhashtable_walk_enter(&flow_table->rhashtable, &hti); rhashtable_walk_start(&hti); while ((tuplehash = rhashtable_walk_next(&hti))) { if (IS_ERR(tuplehash)) { - err = PTR_ERR(tuplehash); - if (err != -EAGAIN) - goto out; - + if (PTR_ERR(tuplehash) != -EAGAIN) { + err = PTR_ERR(tuplehash); + break; + } continue; } if (tuplehash->tuple.dir) @@ -277,7 +274,6 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, iter(flow, data); } -out: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); @@ -290,25 +286,19 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) return (__s32)(flow->timeout - (u32)jiffies) <= 0; } -static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) +static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table) { struct flow_offload_tuple_rhash *tuplehash; struct rhashtable_iter hti; struct flow_offload *flow; - int err; - - err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL); - if (err) - return 0; + rhashtable_walk_enter(&flow_table->rhashtable, &hti); rhashtable_walk_start(&hti); while ((tuplehash = rhashtable_walk_next(&hti))) { if (IS_ERR(tuplehash)) { - err = PTR_ERR(tuplehash); - if (err != -EAGAIN) - goto out; - + if (PTR_ERR(tuplehash) != -EAGAIN) + break; continue; } if (tuplehash->tuple.dir) @@ -321,11 +311,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) FLOW_OFFLOAD_TEARDOWN))) flow_offload_del(flow_table, flow); } -out: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); - - return 1; } static void nf_flow_offload_work_gc(struct work_struct *work) @@ -514,7 +501,7 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); - WARN_ON(!nf_flow_offload_gc_step(flow_table)); + nf_flow_offload_gc_step(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 15ed91309992..1d291a51cd45 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -254,8 +254,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) return NF_ACCEPT; - if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && - nf_flow_nat_ip(flow, skb, thoff, dir) < 0) + if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0) return NF_DROP; flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; @@ -471,8 +470,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (skb_try_make_writable(skb, sizeof(*ip6h))) return NF_DROP; - if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && - nf_flow_nat_ipv6(flow, skb, dir) < 0) + if (nf_flow_nat_ipv6(flow, skb, dir) < 0) return NF_DROP; flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 99606baedda4..38793b95d9bc 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -37,7 +37,7 @@ static void mangle_contents(struct sk_buff *skb, { unsigned char *data; - BUG_ON(skb_is_nonlinear(skb)); + SKB_LINEAR_ASSERT(skb); data = skb_network_header(skb) + dataoff; /* move post-replacement */ @@ -110,8 +110,6 @@ bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb, !enlarge_skb(skb, rep_len - match_len)) return false; - SKB_LINEAR_ASSERT(skb); - tcph = (void *)skb->data + protoff; oldlen = skb->len - protoff; diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index adee04af8d43..78a9e6454ff3 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -52,13 +52,11 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, newdst = 0; - rcu_read_lock(); indev = __in_dev_get_rcu(skb->dev); if (indev && indev->ifa_list) { ifa = indev->ifa_list; newdst = ifa->ifa_local; } - rcu_read_unlock(); if (!newdst) return NF_DROP; @@ -97,7 +95,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, struct inet6_ifaddr *ifa; bool addr = false; - rcu_read_lock(); idev = __in6_dev_get(skb->dev); if (idev != NULL) { read_lock_bh(&idev->lock); @@ -108,7 +105,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } read_unlock_bh(&idev->lock); } - rcu_read_unlock(); if (!addr) return NF_DROP; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2cfb173cd0b2..f0159eea2978 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -27,6 +27,8 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); +static LIST_HEAD(nf_tables_destroy_list); +static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); static u64 table_handle; enum { @@ -64,6 +66,8 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) net->nft.validate_state = new_validate_state; } +static void nf_tables_trans_destroy_work(struct work_struct *w); +static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, @@ -207,6 +211,18 @@ static int nft_delchain(struct nft_ctx *ctx) return err; } +/* either expr ops provide both activate/deactivate, or neither */ +static bool nft_expr_check_ops(const struct nft_expr_ops *ops) +{ + if (!ops) + return true; + + if (WARN_ON_ONCE((!ops->activate ^ !ops->deactivate))) + return false; + + return true; +} + static void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule) { @@ -298,7 +314,7 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) return 0; } -static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, +static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { struct nft_trans *trans; @@ -318,7 +334,7 @@ static int nft_trans_set_add(struct nft_ctx *ctx, int msg_type, return 0; } -static int nft_delset(struct nft_ctx *ctx, struct nft_set *set) +static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set) { int err; @@ -1005,7 +1021,8 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, static void nf_tables_table_destroy(struct nft_ctx *ctx) { - BUG_ON(ctx->table->use > 0); + if (WARN_ON(ctx->table->use > 0)) + return; rhltable_destroy(&ctx->table->chains_ht); kfree(ctx->table->name); @@ -1412,7 +1429,8 @@ static void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; - BUG_ON(chain->use > 0); + if (WARN_ON(chain->use > 0)) + return; /* no concurrent access possible anymore */ nf_tables_chain_free_chain_rules(chain); @@ -1907,6 +1925,9 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, */ int nft_register_expr(struct nft_expr_type *type) { + if (!nft_expr_check_ops(type->ops)) + return -EINVAL; + nfnl_lock(NFNL_SUBSYS_NFTABLES); if (type->family == NFPROTO_UNSPEC) list_add_tail_rcu(&type->list, &nf_tables_expressions); @@ -2054,6 +2075,10 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, err = PTR_ERR(ops); goto err1; } + if (!nft_expr_check_ops(ops)) { + err = -EINVAL; + goto err1; + } } else ops = type->ops; @@ -2434,7 +2459,6 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, { struct nft_expr *expr; - lockdep_assert_held(&ctx->net->nft.commit_mutex); /* * Careful: some expressions might not be initialized in case this * is called on error from nf_tables_newrule(). @@ -3567,13 +3591,6 @@ static void nft_set_destroy(struct nft_set *set) kvfree(set); } -static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) -{ - list_del_rcu(&set->list); - nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); - nft_set_destroy(set); -} - static int nf_tables_delset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -3668,17 +3685,38 @@ bind: } EXPORT_SYMBOL_GPL(nf_tables_bind_set); -void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, +void nf_tables_rebind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { + if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && + nft_is_active(ctx->net, set)) + list_add_tail_rcu(&set->list, &ctx->table->sets); + + list_add_tail_rcu(&binding->list, &set->bindings); +} +EXPORT_SYMBOL_GPL(nf_tables_rebind_set); + +void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_binding *binding) +{ list_del_rcu(&binding->list); if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && nft_is_active(ctx->net, set)) - nf_tables_set_destroy(ctx, set); + list_del_rcu(&set->list); } EXPORT_SYMBOL_GPL(nf_tables_unbind_set); +void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set) +{ + if (list_empty(&set->bindings) && nft_set_is_anonymous(set) && + nft_is_active(ctx->net, set)) { + nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_ATOMIC); + nft_set_destroy(set); + } +} +EXPORT_SYMBOL_GPL(nf_tables_destroy_set); + const struct nft_set_ext_type nft_set_ext_types[] = { [NFT_SET_EXT_KEY] = { .align = __alignof__(u32), @@ -6191,19 +6229,28 @@ static void nft_commit_release(struct nft_trans *trans) nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } + + if (trans->put_net) + put_net(trans->ctx.net); + kfree(trans); } -static void nf_tables_commit_release(struct net *net) +static void nf_tables_trans_destroy_work(struct work_struct *w) { struct nft_trans *trans, *next; + LIST_HEAD(head); - if (list_empty(&net->nft.commit_list)) + spin_lock(&nf_tables_destroy_list_lock); + list_splice_init(&nf_tables_destroy_list, &head); + spin_unlock(&nf_tables_destroy_list_lock); + + if (list_empty(&head)) return; synchronize_rcu(); - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &head, list) { list_del(&trans->list); nft_commit_release(trans); } @@ -6334,6 +6381,37 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nf_tables_commit_release(struct net *net) +{ + struct nft_trans *trans; + + /* all side effects have to be made visible. + * For example, if a chain named 'foo' has been deleted, a + * new transaction must not find it anymore. + * + * Memory reclaim happens asynchronously from work queue + * to prevent expensive synchronize_rcu() in commit phase. + */ + if (list_empty(&net->nft.commit_list)) { + mutex_unlock(&net->nft.commit_mutex); + return; + } + + trans = list_last_entry(&net->nft.commit_list, + struct nft_trans, list); + get_net(trans->ctx.net); + WARN_ON_ONCE(trans->put_net); + + trans->put_net = true; + spin_lock(&nf_tables_destroy_list_lock); + list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); + spin_unlock(&nf_tables_destroy_list_lock); + + mutex_unlock(&net->nft.commit_mutex); + + schedule_work(&trans_destroy_work); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nft_trans *trans, *next; @@ -6495,9 +6573,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } } - nf_tables_commit_release(net); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - mutex_unlock(&net->nft.commit_mutex); + nf_tables_commit_release(net); return 0; } @@ -7168,7 +7245,8 @@ int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; - BUG_ON(!nft_is_base_chain(ctx->chain)); + if (WARN_ON(!nft_is_base_chain(ctx->chain))) + return 0; nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { @@ -7271,6 +7349,7 @@ static int __init nf_tables_module_init(void) { int err; + spin_lock_init(&nf_tables_destroy_list_lock); err = register_pernet_subsys(&nf_tables_net_ops); if (err < 0) return err; @@ -7310,6 +7389,7 @@ static void __exit nf_tables_module_exit(void) unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_destroy_work); rcu_barrier(); nf_tables_core_module_exit(); } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index ffd5c0f9412b..3fbce3b9c5ec 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -249,12 +249,24 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_exthdr_type, }; +static struct nft_object_type *nft_basic_objects[] = { +#ifdef CONFIG_NETWORK_SECMARK + &nft_secmark_obj_type, +#endif +}; + int __init nf_tables_core_module_init(void) { - int err, i; + int err, i, j = 0; + + for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) { + err = nft_register_obj(nft_basic_objects[i]); + if (err) + goto err; + } - for (i = 0; i < ARRAY_SIZE(nft_basic_types); i++) { - err = nft_register_expr(nft_basic_types[i]); + for (j = 0; j < ARRAY_SIZE(nft_basic_types); j++) { + err = nft_register_expr(nft_basic_types[j]); if (err) goto err; } @@ -262,8 +274,12 @@ int __init nf_tables_core_module_init(void) return 0; err: + while (j-- > 0) + nft_unregister_expr(nft_basic_types[j]); + while (i-- > 0) - nft_unregister_expr(nft_basic_types[i]); + nft_unregister_obj(nft_basic_objects[i]); + return err; } @@ -274,4 +290,8 @@ void nf_tables_core_module_exit(void) i = ARRAY_SIZE(nft_basic_types); while (i-- > 0) nft_unregister_expr(nft_basic_types[i]); + + i = ARRAY_SIZE(nft_basic_objects); + while (i-- > 0) + nft_unregister_obj(nft_basic_objects[i]); } diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index a30f8ba4b89a..b48545b84ce8 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -53,9 +53,6 @@ ctnl_timeout_parse_policy(void *timeout, struct nlattr **tb; int ret = 0; - if (!l4proto->ctnl_timeout.nlattr_to_obj) - return 0; - tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); @@ -125,7 +122,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, return -EBUSY; } - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); /* This protocol is not supportted, skip. */ if (l4proto->l4proto != l4num) { @@ -167,6 +164,8 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; + struct nlattr *nest_parms; + int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); @@ -186,22 +185,15 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; - if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { - struct nlattr *nest_parms; - int ret; - - nest_parms = nla_nest_start(skb, - CTA_TIMEOUT_DATA | NLA_F_NESTED); - if (!nest_parms) - goto nla_put_failure; + nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, - &timeout->timeout.data); - if (ret < 0) - goto nla_put_failure; + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->timeout.data); + if (ret < 0) + goto nla_put_failure; - nla_nest_end(skb, nest_parms); - } + nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; @@ -369,7 +361,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); /* This protocol is not supported, skip. */ if (l4proto->l4proto != l4num) { @@ -391,12 +383,14 @@ err: static int cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, - u32 seq, u32 type, int event, + u32 seq, u32 type, int event, u16 l3num, const struct nf_conntrack_l4proto *l4proto) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; + struct nlattr *nest_parms; + int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); @@ -408,25 +402,19 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; - if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l4proto->l3proto)) || + if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; - if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { - struct nlattr *nest_parms; - int ret; - - nest_parms = nla_nest_start(skb, - CTA_TIMEOUT_DATA | NLA_F_NESTED); - if (!nest_parms) - goto nla_put_failure; + nest_parms = nla_nest_start(skb, CTA_TIMEOUT_DATA | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; - ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); - if (ret < 0) - goto nla_put_failure; + ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL); + if (ret < 0) + goto nla_put_failure; - nla_nest_end(skb, nest_parms); - } + nla_nest_end(skb, nest_parms); nlmsg_end(skb, nlh); return skb->len; @@ -454,7 +442,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); /* This protocol is not supported, skip. */ if (l4proto->l4proto != l4num) { @@ -472,6 +460,7 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, + l3num, l4proto); if (ret <= 0) { kfree_skb(skb2); diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index fa90a8402845..79d48c1d06f4 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -79,7 +79,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &desc, tb[NFTA_CMP_DATA]); - BUG_ON(err < 0); + if (err < 0) + return err; priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); err = nft_validate_register_load(priv->sreg, desc.len); @@ -129,7 +130,8 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, err = nft_data_init(NULL, &data, sizeof(data), &desc, tb[NFTA_CMP_DATA]); - BUG_ON(err < 0); + if (err < 0) + return err; priv->sreg = nft_parse_register(tb[NFTA_CMP_SREG]); err = nft_validate_register_load(priv->sreg, desc.len); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 5dd87748afa8..586627c361df 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -279,7 +279,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, { const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; -#ifdef CONFIG_NF_CONNTRACK_MARK +#if defined(CONFIG_NF_CONNTRACK_MARK) || defined(CONFIG_NF_CONNTRACK_SECMARK) u32 value = regs->data[priv->sreg]; #endif enum ip_conntrack_info ctinfo; @@ -298,6 +298,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr, } break; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + if (ct->secmark != value) { + ct->secmark = value; + nf_conntrack_event_cache(IPCT_SECMARK, ct); + } + break; +#endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_replace(ct, @@ -565,6 +573,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, len = sizeof(u32); break; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + case NFT_CT_SECMARK: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = sizeof(u32); + break; +#endif default: return -EOPNOTSUPP; } @@ -776,9 +791,6 @@ nft_ct_timeout_parse_policy(void *timeouts, struct nlattr **tb; int ret = 0; - if (!l4proto->ctnl_timeout.nlattr_to_obj) - return 0; - tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); @@ -858,7 +870,7 @@ static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); priv->l4proto = l4num; - l4proto = nf_ct_l4proto_find_get(l3num, l4num); + l4proto = nf_ct_l4proto_find_get(l4num); if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 6e91a37d57f2..07d4efd3d851 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -235,14 +235,31 @@ err1: return err; } +static void nft_dynset_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + + nf_tables_rebind_set(ctx, priv->set, &priv->binding); +} + +static void nft_dynset_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_dynset *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + static void nft_dynset_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_dynset *priv = nft_expr_priv(expr); - nf_tables_unbind_set(ctx, priv->set, &priv->binding); if (priv->expr != NULL) nft_expr_destroy(ctx, priv->expr); + + nf_tables_destroy_set(ctx, priv->set); } static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -279,6 +296,8 @@ static const struct nft_expr_ops nft_dynset_ops = { .eval = nft_dynset_eval, .init = nft_dynset_init, .destroy = nft_dynset_destroy, + .activate = nft_dynset_activate, + .deactivate = nft_dynset_deactivate, .dump = nft_dynset_dump, }; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index ad13e8643599..227b2b15a19c 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -121,12 +121,28 @@ static int nft_lookup_init(const struct nft_ctx *ctx, return 0; } +static void nft_lookup_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_lookup *priv = nft_expr_priv(expr); + + nf_tables_rebind_set(ctx, priv->set, &priv->binding); +} + +static void nft_lookup_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_lookup *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + static void nft_lookup_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_lookup *priv = nft_expr_priv(expr); - nf_tables_unbind_set(ctx, priv->set, &priv->binding); + nf_tables_destroy_set(ctx, priv->set); } static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) @@ -209,6 +225,8 @@ static const struct nft_expr_ops nft_lookup_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), .eval = nft_lookup_eval, .init = nft_lookup_init, + .activate = nft_lookup_activate, + .deactivate = nft_lookup_deactivate, .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, .validate = nft_lookup_validate, diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 297fe7d97c18..6180626c3f80 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -284,6 +284,11 @@ static void nft_meta_set_eval(const struct nft_expr *expr, skb->nf_trace = !!value8; break; +#ifdef CONFIG_NETWORK_SECMARK + case NFT_META_SECMARK: + skb->secmark = value; + break; +#endif default: WARN_ON(1); } @@ -436,6 +441,9 @@ static int nft_meta_set_init(const struct nft_ctx *ctx, switch (priv->key) { case NFT_META_MARK: case NFT_META_PRIORITY: +#ifdef CONFIG_NETWORK_SECMARK + case NFT_META_SECMARK: +#endif len = sizeof(u32); break; case NFT_META_NFTRACE: @@ -543,3 +551,111 @@ struct nft_expr_type nft_meta_type __read_mostly = { .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, }; + +#ifdef CONFIG_NETWORK_SECMARK +struct nft_secmark { + u32 secid; + char *ctx; +}; + +static const struct nla_policy nft_secmark_policy[NFTA_SECMARK_MAX + 1] = { + [NFTA_SECMARK_CTX] = { .type = NLA_STRING, .len = NFT_SECMARK_CTX_MAXLEN }, +}; + +static int nft_secmark_compute_secid(struct nft_secmark *priv) +{ + u32 tmp_secid = 0; + int err; + + err = security_secctx_to_secid(priv->ctx, strlen(priv->ctx), &tmp_secid); + if (err) + return err; + + if (!tmp_secid) + return -ENOENT; + + err = security_secmark_relabel_packet(tmp_secid); + if (err) + return err; + + priv->secid = tmp_secid; + return 0; +} + +static void nft_secmark_obj_eval(struct nft_object *obj, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_secmark *priv = nft_obj_data(obj); + struct sk_buff *skb = pkt->skb; + + skb->secmark = priv->secid; +} + +static int nft_secmark_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_secmark *priv = nft_obj_data(obj); + int err; + + if (tb[NFTA_SECMARK_CTX] == NULL) + return -EINVAL; + + priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL); + if (!priv->ctx) + return -ENOMEM; + + err = nft_secmark_compute_secid(priv); + if (err) { + kfree(priv->ctx); + return err; + } + + security_secmark_refcount_inc(); + + return 0; +} + +static int nft_secmark_obj_dump(struct sk_buff *skb, struct nft_object *obj, + bool reset) +{ + struct nft_secmark *priv = nft_obj_data(obj); + int err; + + if (nla_put_string(skb, NFTA_SECMARK_CTX, priv->ctx)) + return -1; + + if (reset) { + err = nft_secmark_compute_secid(priv); + if (err) + return err; + } + + return 0; +} + +static void nft_secmark_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) +{ + struct nft_secmark *priv = nft_obj_data(obj); + + security_secmark_refcount_dec(); + + kfree(priv->ctx); +} + +static const struct nft_object_ops nft_secmark_obj_ops = { + .type = &nft_secmark_obj_type, + .size = sizeof(struct nft_secmark), + .init = nft_secmark_obj_init, + .eval = nft_secmark_obj_eval, + .dump = nft_secmark_obj_dump, + .destroy = nft_secmark_obj_destroy, +}; +struct nft_object_type nft_secmark_obj_type __read_mostly = { + .type = NFT_OBJECT_SECMARK, + .ops = &nft_secmark_obj_ops, + .maxattr = NFTA_SECMARK_MAX, + .policy = nft_secmark_policy, + .owner = THIS_MODULE, +}; +#endif /* CONFIG_NETWORK_SECMARK */ diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index cdf348f751ec..a3185ca2a3a9 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -155,12 +155,28 @@ nla_put_failure: return -1; } +static void nft_objref_map_activate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + + nf_tables_rebind_set(ctx, priv->set, &priv->binding); +} + +static void nft_objref_map_deactivate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_objref_map *priv = nft_expr_priv(expr); + + nf_tables_unbind_set(ctx, priv->set, &priv->binding); +} + static void nft_objref_map_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_objref_map *priv = nft_expr_priv(expr); - nf_tables_unbind_set(ctx, priv->set, &priv->binding); + nf_tables_destroy_set(ctx, priv->set); } static struct nft_expr_type nft_objref_type; @@ -169,6 +185,8 @@ static const struct nft_expr_ops nft_objref_map_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), .eval = nft_objref_map_eval, .init = nft_objref_map_init, + .activate = nft_objref_map_activate, + .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, }; diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 29f5bd2377b0..b48e58cceeb7 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -94,7 +94,8 @@ static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = { int nft_reject_icmp_code(u8 code) { - BUG_ON(code > NFT_REJECT_ICMPX_MAX); + if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) + return ICMP_NET_UNREACH; return icmp_code_v4[code]; } @@ -111,7 +112,8 @@ static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = { int nft_reject_icmpv6_code(u8 code) { - BUG_ON(code > NFT_REJECT_ICMPX_MAX); + if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) + return ICMPV6_NOROUTE; return icmp_code_v6[code]; } diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 76dba9f6b6f6..f35fa33913ae 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -90,6 +90,11 @@ static void nft_rt_get_eval(const struct nft_expr *expr, case NFT_RT_TCPMSS: nft_reg_store16(dest, get_tcpmss(pkt, dst)); break; +#ifdef CONFIG_XFRM + case NFT_RT_XFRM: + nft_reg_store8(dest, !!dst->xfrm); + break; +#endif default: WARN_ON(1); goto err; @@ -130,6 +135,11 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, case NFT_RT_TCPMSS: len = sizeof(u16); break; +#ifdef CONFIG_XFRM + case NFT_RT_XFRM: + len = sizeof(u8); + break; +#endif default: return -EOPNOTSUPP; } @@ -164,6 +174,7 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp case NFT_RT_NEXTHOP4: case NFT_RT_NEXTHOP6: case NFT_RT_CLASSID: + case NFT_RT_XFRM: return 0; case NFT_RT_TCPMSS: hooks = (1 << NF_INET_FORWARD) | diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 015124e649cb..339a9dd1c832 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -88,7 +88,7 @@ static bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, .key = key, }; - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) *ext = &he->ext; @@ -106,7 +106,7 @@ static void *nft_rhash_get(const struct net *net, const struct nft_set *set, .key = elem->key.val.data, }; - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) return he; @@ -129,7 +129,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, .key = key, }; - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) goto out; @@ -217,7 +217,7 @@ static void *nft_rhash_deactivate(const struct net *net, }; rcu_read_lock(); - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_rhash_params); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL && !nft_rhash_flush(net, set, he)) he = NULL; @@ -244,21 +244,15 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_rhash_elem *he; struct rhashtable_iter hti; struct nft_set_elem elem; - int err; - - err = rhashtable_walk_init(&priv->ht, &hti, GFP_ATOMIC); - iter->err = err; - if (err) - return; + rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { - err = PTR_ERR(he); - if (err != -EAGAIN) { - iter->err = err; - goto out; + if (PTR_ERR(he) != -EAGAIN) { + iter->err = PTR_ERR(he); + break; } continue; @@ -275,13 +269,11 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, iter->err = iter->fn(ctx, set, iter, &elem); if (iter->err < 0) - goto out; + break; cont: iter->count++; } - -out: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); } @@ -293,21 +285,17 @@ static void nft_rhash_gc(struct work_struct *work) struct nft_rhash *priv; struct nft_set_gc_batch *gcb = NULL; struct rhashtable_iter hti; - int err; priv = container_of(work, struct nft_rhash, gc_work.work); set = nft_set_container_of(priv); - err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); - if (err) - goto schedule; - + rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); while ((he = rhashtable_walk_next(&hti))) { if (IS_ERR(he)) { if (PTR_ERR(he) != -EAGAIN) - goto out; + break; continue; } @@ -326,17 +314,15 @@ gc: gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); if (gcb == NULL) - goto out; + break; rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, he); } -out: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); nft_set_gc_batch_complete(gcb); -schedule: queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); } diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c new file mode 100644 index 000000000000..3cf71a2e375b --- /dev/null +++ b/net/netfilter/nft_xfrm.c @@ -0,0 +1,293 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Generic part shared by ipv4 and ipv6 backends. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <linux/in.h> +#include <net/xfrm.h> + +static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { + [NFTA_XFRM_KEY] = { .type = NLA_U32 }, + [NFTA_XFRM_DIR] = { .type = NLA_U8 }, + [NFTA_XFRM_SPNUM] = { .type = NLA_U32 }, + [NFTA_XFRM_DREG] = { .type = NLA_U32 }, +}; + +struct nft_xfrm { + enum nft_xfrm_keys key:8; + enum nft_registers dreg:8; + u8 dir; + u8 spnum; +}; + +static int nft_xfrm_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_xfrm *priv = nft_expr_priv(expr); + unsigned int len = 0; + u32 spnum = 0; + u8 dir; + + if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG]) + return -EINVAL; + + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + break; + default: + return -EOPNOTSUPP; + } + + priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY])); + switch (priv->key) { + case NFT_XFRM_KEY_REQID: + case NFT_XFRM_KEY_SPI: + len = sizeof(u32); + break; + case NFT_XFRM_KEY_DADDR_IP4: + case NFT_XFRM_KEY_SADDR_IP4: + len = sizeof(struct in_addr); + break; + case NFT_XFRM_KEY_DADDR_IP6: + case NFT_XFRM_KEY_SADDR_IP6: + len = sizeof(struct in6_addr); + break; + default: + return -EINVAL; + } + + dir = nla_get_u8(tb[NFTA_XFRM_DIR]); + switch (dir) { + case XFRM_POLICY_IN: + case XFRM_POLICY_OUT: + priv->dir = dir; + break; + default: + return -EINVAL; + } + + if (tb[NFTA_XFRM_SPNUM]) + spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM])); + + if (spnum >= XFRM_MAX_DEPTH) + return -ERANGE; + + priv->spnum = spnum; + + priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +/* Return true if key asks for daddr/saddr and current + * state does have a valid address (BEET, TUNNEL). + */ +static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) +{ + switch (k) { + case NFT_XFRM_KEY_DADDR_IP4: + case NFT_XFRM_KEY_SADDR_IP4: + if (family == NFPROTO_IPV4) + break; + return false; + case NFT_XFRM_KEY_DADDR_IP6: + case NFT_XFRM_KEY_SADDR_IP6: + if (family == NFPROTO_IPV6) + break; + return false; + default: + return true; + } + + return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; +} + +static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, + struct nft_regs *regs, + const struct xfrm_state *state, + u8 family) +{ + u32 *dest = ®s->data[priv->dreg]; + + if (!xfrm_state_addr_ok(priv->key, family, state->props.mode)) { + regs->verdict.code = NFT_BREAK; + return; + } + + switch (priv->key) { + case NFT_XFRM_KEY_UNSPEC: + case __NFT_XFRM_KEY_MAX: + WARN_ON_ONCE(1); + break; + case NFT_XFRM_KEY_DADDR_IP4: + *dest = state->id.daddr.a4; + return; + case NFT_XFRM_KEY_DADDR_IP6: + memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr)); + return; + case NFT_XFRM_KEY_SADDR_IP4: + *dest = state->props.saddr.a4; + return; + case NFT_XFRM_KEY_SADDR_IP6: + memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr)); + return; + case NFT_XFRM_KEY_REQID: + *dest = state->props.reqid; + return; + case NFT_XFRM_KEY_SPI: + *dest = state->id.spi; + return; + } + + regs->verdict.code = NFT_BREAK; +} + +static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct sec_path *sp = pkt->skb->sp; + const struct xfrm_state *state; + + if (sp == NULL || sp->len <= priv->spnum) { + regs->verdict.code = NFT_BREAK; + return; + } + + state = sp->xvec[priv->spnum]; + nft_xfrm_state_get_key(priv, regs, state, nft_pf(pkt)); +} + +static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct dst_entry *dst = skb_dst(pkt->skb); + int i; + + for (i = 0; dst && dst->xfrm; + dst = ((const struct xfrm_dst *)dst)->child, i++) { + if (i < priv->spnum) + continue; + + nft_xfrm_state_get_key(priv, regs, dst->xfrm, nft_pf(pkt)); + return; + } + + regs->verdict.code = NFT_BREAK; +} + +static void nft_xfrm_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + + switch (priv->dir) { + case XFRM_POLICY_IN: + nft_xfrm_get_eval_in(priv, regs, pkt); + break; + case XFRM_POLICY_OUT: + nft_xfrm_get_eval_out(priv, regs, pkt); + break; + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + break; + } +} + +static int nft_xfrm_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg)) + return -1; + + if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key))) + return -1; + if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir)) + return -1; + if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum))) + return -1; + + return 0; +} + +static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + unsigned int hooks; + + switch (priv->dir) { + case XFRM_POLICY_IN: + hooks = (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING); + break; + case XFRM_POLICY_OUT: + hooks = (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING); + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + + +static struct nft_expr_type nft_xfrm_type; +static const struct nft_expr_ops nft_xfrm_get_ops = { + .type = &nft_xfrm_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)), + .eval = nft_xfrm_get_eval, + .init = nft_xfrm_get_init, + .dump = nft_xfrm_get_dump, + .validate = nft_xfrm_validate, +}; + +static struct nft_expr_type nft_xfrm_type __read_mostly = { + .name = "xfrm", + .ops = &nft_xfrm_get_ops, + .policy = nft_xfrm_policy, + .maxattr = NFTA_XFRM_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_xfrm_module_init(void) +{ + return nft_register_expr(&nft_xfrm_type); +} + +static void __exit nft_xfrm_module_exit(void) +{ + nft_unregister_expr(&nft_xfrm_type); +} + +module_init(nft_xfrm_module_init); +module_exit(nft_xfrm_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching"); +MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); +MODULE_AUTHOR("Máté Eckl <ecklm94@gmail.com>"); +MODULE_ALIAS_NFT_EXPR("xfrm"); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 89457efd2e00..2c7a4b80206f 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -159,7 +159,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, /* Make sure the timeout policy matches any existing protocol tracker, * otherwise default to generic. */ - l4proto = __nf_ct_l4proto_find(par->family, proto); + l4proto = __nf_ct_l4proto_find(proto); if (timeout->l4proto->l4proto != l4proto->l4proto) { ret = -EINVAL; pr_info_ratelimited("Timeout policy `%s' can only be used by L%d protocol number %d\n", diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 5ee859193783..c6acfc2d9c84 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -68,8 +68,6 @@ struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) { struct idletimer_tg *entry; - BUG_ON(!label); - list_for_each_entry(entry, &idletimer_tg_list, entry) { if (!strcmp(label, entry->attr.attr.name)) return entry; @@ -172,8 +170,6 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); - BUG_ON(!info->timer); - mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 4ad5fe27e08b..f16202d26c20 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -35,8 +35,6 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) u32 secmark = 0; const struct xt_secmark_target_info *info = par->targinfo; - BUG_ON(info->mode != mode); - switch (mode) { case SECMARK_MODE_SEL: secmark = info->secid; diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 5d92e1781980..5cb1ecb29ea4 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -68,6 +68,38 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return 0; } +static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) +{ + struct xt_cgroup_info_v2 *info = par->matchinfo; + struct cgroup *cgrp; + + if ((info->invert_path & ~1) || (info->invert_classid & ~1)) + return -EINVAL; + + if (!info->has_path && !info->has_classid) { + pr_info("xt_cgroup: no path or classid specified\n"); + return -EINVAL; + } + + if (info->has_path && info->has_classid) { + pr_info_ratelimited("path and classid specified\n"); + return -EINVAL; + } + + info->priv = NULL; + if (info->has_path) { + cgrp = cgroup_get_from_path(info->path); + if (IS_ERR(cgrp)) { + pr_info_ratelimited("invalid path, errno=%ld\n", + PTR_ERR(cgrp)); + return -EINVAL; + } + info->priv = cgrp; + } + + return 0; +} + static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { @@ -99,6 +131,24 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) info->invert_classid; } +static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cgroup_info_v2 *info = par->matchinfo; + struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; + struct cgroup *ancestor = info->priv; + struct sock *sk = skb->sk; + + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) + return false; + + if (ancestor) + return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ + info->invert_path; + else + return (info->classid == sock_cgroup_classid(skcd)) ^ + info->invert_classid; +} + static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) { struct xt_cgroup_info_v1 *info = par->matchinfo; @@ -107,6 +157,14 @@ static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) cgroup_put(info->priv); } +static void cgroup_mt_destroy_v2(const struct xt_mtdtor_param *par) +{ + struct xt_cgroup_info_v2 *info = par->matchinfo; + + if (info->priv) + cgroup_put(info->priv); +} + static struct xt_match cgroup_mt_reg[] __read_mostly = { { .name = "cgroup", @@ -134,6 +192,20 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = { (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, + { + .name = "cgroup", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check_v2, + .match = cgroup_mt_v2, + .matchsize = sizeof(struct xt_cgroup_info_v2), + .usersize = offsetof(struct xt_cgroup_info_v2, priv), + .destroy = cgroup_mt_destroy_v2, + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + }, }; static int __init cgroup_mt_init(void) diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 10d61a6eed71..fceae245eb03 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -11,11 +11,6 @@ #include <linux/netfilter/xt_quota.h> #include <linux/module.h> -struct xt_quota_priv { - spinlock_t lock; - uint64_t quota; -}; - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sam Johnston <samj@samj.net>"); MODULE_DESCRIPTION("Xtables: countdown quota match"); @@ -26,54 +21,48 @@ static bool quota_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct xt_quota_info *q = (void *)par->matchinfo; - struct xt_quota_priv *priv = q->master; + u64 current_count = atomic64_read(&q->counter); bool ret = q->flags & XT_QUOTA_INVERT; - - spin_lock_bh(&priv->lock); - if (priv->quota >= skb->len) { - priv->quota -= skb->len; - ret = !ret; - } else { - /* we do not allow even small packets from now on */ - priv->quota = 0; - } - spin_unlock_bh(&priv->lock); - - return ret; + u64 old_count, new_count; + + do { + if (current_count == 1) + return ret; + if (current_count <= skb->len) { + atomic64_set(&q->counter, 1); + return ret; + } + old_count = current_count; + new_count = current_count - skb->len; + current_count = atomic64_cmpxchg(&q->counter, old_count, + new_count); + } while (current_count != old_count); + return !ret; } static int quota_mt_check(const struct xt_mtchk_param *par) { struct xt_quota_info *q = par->matchinfo; + BUILD_BUG_ON(sizeof(atomic64_t) != sizeof(__u64)); + if (q->flags & ~XT_QUOTA_MASK) return -EINVAL; + if (atomic64_read(&q->counter) > q->quota + 1) + return -ERANGE; - q->master = kmalloc(sizeof(*q->master), GFP_KERNEL); - if (q->master == NULL) - return -ENOMEM; - - spin_lock_init(&q->master->lock); - q->master->quota = q->quota; + if (atomic64_read(&q->counter) == 0) + atomic64_set(&q->counter, q->quota + 1); return 0; } -static void quota_mt_destroy(const struct xt_mtdtor_param *par) -{ - const struct xt_quota_info *q = par->matchinfo; - - kfree(q->master); -} - static struct xt_match quota_mt_reg __read_mostly = { .name = "quota", .revision = 0, .family = NFPROTO_UNSPEC, .match = quota_mt, .checkentry = quota_mt_check, - .destroy = quota_mt_destroy, .matchsize = sizeof(struct xt_quota_info), - .usersize = offsetof(struct xt_quota_info, master), .me = THIS_MODULE, }; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e3a0538ec0be..e613a9f89600 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1706,6 +1706,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_EXT_ACK; err = 0; break; + case NETLINK_DUMP_STRICT_CHK: + if (val) + nlk->flags |= NETLINK_F_STRICT_CHK; + else + nlk->flags &= ~NETLINK_F_STRICT_CHK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1799,6 +1806,15 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_DUMP_STRICT_CHK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_STRICT_CHK ? 1 : 0; + if (put_user(len, optlen) || put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2171,6 +2187,7 @@ EXPORT_SYMBOL(__nlmsg_put); static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; @@ -2222,8 +2239,11 @@ static int netlink_dump(struct sock *sk) skb_reserve(skb, skb_tailroom(skb) - alloc_size); netlink_skb_set_owner_r(skb, sk); - if (nlk->dump_done_errno > 0) + if (nlk->dump_done_errno > 0) { + cb->extack = &extack; nlk->dump_done_errno = cb->dump(skb, cb); + cb->extack = NULL; + } if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { @@ -2246,6 +2266,12 @@ static int netlink_dump(struct sock *sk) memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); + if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) { + nlh->nlmsg_flags |= NLM_F_ACK_TLVS; + if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg)) + nlmsg_end(skb, nlh); + } + if (sk_filter(sk, skb)) kfree_skb(skb); else @@ -2272,9 +2298,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { + struct netlink_sock *nlk, *nlk2; struct netlink_callback *cb; struct sock *sk; - struct netlink_sock *nlk; int ret; refcount_inc(&skb->users); @@ -2308,6 +2334,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; + nlk2 = nlk_sk(NETLINK_CB(skb).sk); + cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK); + if (control->start) { ret = control->start(cb); if (ret) diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 962de7b3c023..5f454c8de6a4 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -15,6 +15,7 @@ #define NETLINK_F_LISTEN_ALL_NSID 0x10 #define NETLINK_F_CAP_ACK 0x20 #define NETLINK_F_EXT_ACK 0x40 +#define NETLINK_F_STRICT_CHK 0x80 #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 35ae64cbef33..6bec37ab4472 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -933,6 +933,11 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, struct nf_conn *ct; if (!cached) { + struct nf_hook_state state = { + .hook = NF_INET_PRE_ROUTING, + .pf = info->family, + .net = net, + }; struct nf_conn *tmpl = info->ct; int err; @@ -944,8 +949,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, nf_ct_set(skb, tmpl, IP_CT_NEW); } - err = nf_conntrack_in(net, info->family, - NF_INET_PRE_ROUTING, skb); + err = nf_conntrack_in(skb, &state); if (err != NF_ACCEPT) return -ENOENT; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 55153da00278..9c1b0729aebf 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1452,7 +1452,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) u32 act_count = 0; ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, - tcaa_policy, NULL); + tcaa_policy, cb->extack); if (ret < 0) return ret; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index d670d3066ebd..43c8559aca56 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1727,7 +1727,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, + cb->extack); if (err) return err; @@ -2054,7 +2055,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, + cb->extack); if (err) return err; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index f218ccf1e2d9..ac79a40a0392 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -68,7 +68,6 @@ struct tc_u_knode { u32 mask; u32 __percpu *pcpu_success; #endif - struct tcf_proto *tp; struct rcu_work rwork; /* The 'sel' field MUST be the last field in structure to allow for * tc_u32_keys allocated at end of structure. @@ -80,10 +79,10 @@ struct tc_u_hnode { struct tc_u_hnode __rcu *next; u32 handle; u32 prio; - struct tc_u_common *tp_c; int refcnt; unsigned int divisor; struct idr handle_idr; + bool is_root; struct rcu_head rcu; u32 flags; /* The 'ht' field MUST be the last field in structure to allow for @@ -98,7 +97,7 @@ struct tc_u_common { int refcnt; struct idr handle_idr; struct hlist_node hnode; - struct rcu_head rcu; + long knodes; }; static inline unsigned int u32_hash_fold(__be32 key, @@ -344,19 +343,16 @@ static void *tc_u_common_ptr(const struct tcf_proto *tp) return block->q; } -static unsigned int tc_u_hash(const struct tcf_proto *tp) +static struct hlist_head *tc_u_hash(void *key) { - return hash_ptr(tc_u_common_ptr(tp), U32_HASH_SHIFT); + return tc_u_common_hash + hash_ptr(key, U32_HASH_SHIFT); } -static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) +static struct tc_u_common *tc_u_common_find(void *key) { struct tc_u_common *tc; - unsigned int h; - - h = tc_u_hash(tp); - hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) { - if (tc->ptr == tc_u_common_ptr(tp)) + hlist_for_each_entry(tc, tc_u_hash(key), hnode) { + if (tc->ptr == key) return tc; } return NULL; @@ -365,10 +361,8 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) static int u32_init(struct tcf_proto *tp) { struct tc_u_hnode *root_ht; - struct tc_u_common *tp_c; - unsigned int h; - - tp_c = tc_u_common_find(tp); + void *key = tc_u_common_ptr(tp); + struct tc_u_common *tp_c = tc_u_common_find(key); root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL); if (root_ht == NULL) @@ -377,6 +371,7 @@ static int u32_init(struct tcf_proto *tp) root_ht->refcnt++; root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; root_ht->prio = tp->prio; + root_ht->is_root = true; idr_init(&root_ht->handle_idr); if (tp_c == NULL) { @@ -385,26 +380,23 @@ static int u32_init(struct tcf_proto *tp) kfree(root_ht); return -ENOBUFS; } - tp_c->ptr = tc_u_common_ptr(tp); + tp_c->ptr = key; INIT_HLIST_NODE(&tp_c->hnode); idr_init(&tp_c->handle_idr); - h = tc_u_hash(tp); - hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]); + hlist_add_head(&tp_c->hnode, tc_u_hash(key)); } tp_c->refcnt++; RCU_INIT_POINTER(root_ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, root_ht); - root_ht->tp_c = tp_c; rcu_assign_pointer(tp->root, root_ht); tp->data = tp_c; return 0; } -static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, - bool free_pf) +static int u32_destroy_key(struct tc_u_knode *n, bool free_pf) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); @@ -438,7 +430,7 @@ static void u32_delete_key_work(struct work_struct *work) struct tc_u_knode, rwork); rtnl_lock(); - u32_destroy_key(key->tp, key, false); + u32_destroy_key(key, false); rtnl_unlock(); } @@ -455,12 +447,13 @@ static void u32_delete_key_freepf_work(struct work_struct *work) struct tc_u_knode, rwork); rtnl_lock(); - u32_destroy_key(key->tp, key, true); + u32_destroy_key(key, true); rtnl_unlock(); } static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) { + struct tc_u_common *tp_c = tp->data; struct tc_u_knode __rcu **kp; struct tc_u_knode *pkp; struct tc_u_hnode *ht = rtnl_dereference(key->ht_up); @@ -471,6 +464,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) kp = &pkp->next, pkp = rtnl_dereference(*kp)) { if (pkp == key) { RCU_INIT_POINTER(*kp, key->next); + tp_c->knodes--; tcf_unbind_filter(tp, &key->res); idr_remove(&ht->handle_idr, key->handle); @@ -585,6 +579,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, struct netlink_ext_ack *extack) { + struct tc_u_common *tp_c = tp->data; struct tc_u_knode *n; unsigned int h; @@ -592,13 +587,14 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, while ((n = rtnl_dereference(ht->ht[h])) != NULL) { RCU_INIT_POINTER(ht->ht[h], rtnl_dereference(n->next)); + tp_c->knodes--; tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n, extack); idr_remove(&ht->handle_idr, n->handle); if (tcf_exts_get_net(&n->exts)) tcf_queue_work(&n->rwork, u32_delete_key_freepf_work); else - u32_destroy_key(n->tp, n, true); + u32_destroy_key(n, true); } } } @@ -631,17 +627,6 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, return -ENOENT; } -static bool ht_empty(struct tc_u_hnode *ht) -{ - unsigned int h; - - for (h = 0; h <= ht->divisor; h++) - if (rcu_access_pointer(ht->ht[h])) - return false; - - return true; -} - static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; @@ -679,20 +664,16 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, struct netlink_ext_ack *extack) { struct tc_u_hnode *ht = arg; - struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); struct tc_u_common *tp_c = tp->data; int ret = 0; - if (ht == NULL) - goto out; - if (TC_U32_KEY(ht->handle)) { u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack); ret = u32_delete_key(tp, (struct tc_u_knode *)ht); goto out; } - if (root_ht == ht) { + if (ht->is_root) { NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node"); return -EINVAL; } @@ -706,38 +687,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, } out: - *last = true; - if (root_ht) { - if (root_ht->refcnt > 1) { - *last = false; - goto ret; - } - if (root_ht->refcnt == 1) { - if (!ht_empty(root_ht)) { - *last = false; - goto ret; - } - } - } - - if (tp_c->refcnt > 1) { - *last = false; - goto ret; - } - - if (tp_c->refcnt == 1) { - struct tc_u_hnode *ht; - - for (ht = rtnl_dereference(tp_c->hlist); - ht; - ht = rtnl_dereference(ht->next)) - if (!ht_empty(ht)) { - *last = false; - break; - } - } - -ret: + *last = tp_c->refcnt == 1 && tp_c->knodes == 0; return ret; } @@ -768,7 +718,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { }; static int u32_set_parms(struct net *net, struct tcf_proto *tp, - unsigned long base, struct tc_u_hnode *ht, + unsigned long base, struct tc_u_knode *n, struct nlattr **tb, struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) @@ -789,12 +739,16 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, } if (handle) { - ht_down = u32_lookup_ht(ht->tp_c, handle); + ht_down = u32_lookup_ht(tp->data, handle); if (!ht_down) { NL_SET_ERR_MSG_MOD(extack, "Link hash table not found"); return -EINVAL; } + if (ht_down->is_root) { + NL_SET_ERR_MSG_MOD(extack, "Not linking to root node"); + return -EINVAL; + } ht_down->refcnt++; } @@ -891,7 +845,6 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, /* Similarly success statistics must be moved as pointers */ new->pcpu_success = n->pcpu_success; #endif - new->tp = tp; memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) { @@ -960,18 +913,17 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!new) return -ENOMEM; - err = u32_set_parms(net, tp, base, - rtnl_dereference(n->ht_up), new, tb, + err = u32_set_parms(net, tp, base, new, tb, tca[TCA_RATE], ovr, extack); if (err) { - u32_destroy_key(tp, new, false); + u32_destroy_key(new, false); return err; } err = u32_replace_hw_knode(tp, new, flags, extack); if (err) { - u32_destroy_key(tp, new, false); + u32_destroy_key(new, false); return err; } @@ -988,7 +940,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_U32_DIVISOR]) { unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); - if (--divisor > 0x100) { + if (!is_power_of_2(divisor)) { + NL_SET_ERR_MSG_MOD(extack, "Divisor is not a power of 2"); + return -EINVAL; + } + if (divisor-- > 0x100) { NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets"); return -EINVAL; } @@ -1013,7 +969,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; } } - ht->tp_c = tp_c; ht->refcnt = 1; ht->divisor = divisor; ht->handle = handle; @@ -1103,7 +1058,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; n->flags = flags; - n->tp = tp; err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) @@ -1125,7 +1079,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr, + err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr, extack); if (err == 0) { struct tc_u_knode __rcu **ins; @@ -1146,6 +1100,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); + tp_c->knodes++; *arg = n; return 0; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index da1963b19dec..cf5c714ae786 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1671,7 +1671,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) ASSERT_RTNL(); err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, - rtm_tca_policy, NULL); + rtm_tca_policy, cb->extack); if (err < 0) return err; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 3023929852e8..de1663f7d3ad 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -572,6 +572,18 @@ struct Qdisc noop_qdisc = { .dev_queue = &noop_netdev_queue, .running = SEQCNT_ZERO(noop_qdisc.running), .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), + .gso_skb = { + .next = (struct sk_buff *)&noop_qdisc.gso_skb, + .prev = (struct sk_buff *)&noop_qdisc.gso_skb, + .qlen = 0, + .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock), + }, + .skb_bad_txq = { + .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq, + .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq, + .qlen = 0, + .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), + }, }; EXPORT_SYMBOL(noop_qdisc); @@ -1273,8 +1285,6 @@ static void dev_init_scheduler_queue(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; - __skb_queue_head_init(&qdisc->gso_skb); - __skb_queue_head_init(&qdisc->skb_bad_txq); } void dev_init_scheduler(struct net_device *dev) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 555427b3e0fe..a264cf2accd0 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -32,37 +32,49 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) { unsigned long flags; - if (xs->dev) { - spin_lock_irqsave(&umem->xsk_list_lock, flags); - list_del_rcu(&xs->list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); - - if (umem->zc) - synchronize_net(); - } + spin_lock_irqsave(&umem->xsk_list_lock, flags); + list_del_rcu(&xs->list); + spin_unlock_irqrestore(&umem->xsk_list_lock, flags); } -int xdp_umem_query(struct net_device *dev, u16 queue_id) +/* The umem is stored both in the _rx struct and the _tx struct as we do + * not know if the device has more tx queues than rx, or the opposite. + * This might also change during run time. + */ +static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem, + u16 queue_id) { - struct netdev_bpf bpf; + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].umem = umem; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].umem = umem; +} - ASSERT_RTNL(); +struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, + u16 queue_id) +{ + if (queue_id < dev->real_num_rx_queues) + return dev->_rx[queue_id].umem; + if (queue_id < dev->real_num_tx_queues) + return dev->_tx[queue_id].umem; - memset(&bpf, 0, sizeof(bpf)); - bpf.command = XDP_QUERY_XSK_UMEM; - bpf.xsk.queue_id = queue_id; + return NULL; +} - if (!dev->netdev_ops->ndo_bpf) - return 0; - return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem; +static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id) +{ + if (queue_id < dev->real_num_rx_queues) + dev->_rx[queue_id].umem = NULL; + if (queue_id < dev->real_num_tx_queues) + dev->_tx[queue_id].umem = NULL; } int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, - u32 queue_id, u16 flags) + u16 queue_id, u16 flags) { bool force_zc, force_copy; struct netdev_bpf bpf; - int err; + int err = 0; force_zc = flags & XDP_ZEROCOPY; force_copy = flags & XDP_COPY; @@ -70,17 +82,23 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, if (force_zc && force_copy) return -EINVAL; - if (force_copy) - return 0; + rtnl_lock(); + if (xdp_get_umem_from_qid(dev, queue_id)) { + err = -EBUSY; + goto out_rtnl_unlock; + } - if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) - return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */ + xdp_reg_umem_at_qid(dev, umem, queue_id); + umem->dev = dev; + umem->queue_id = queue_id; + if (force_copy) + /* For copy-mode, we are done. */ + goto out_rtnl_unlock; - rtnl_lock(); - err = xdp_umem_query(dev, queue_id); - if (err) { - err = err < 0 ? -EOPNOTSUPP : -EBUSY; - goto err_rtnl_unlock; + if (!dev->netdev_ops->ndo_bpf || + !dev->netdev_ops->ndo_xsk_async_xmit) { + err = -EOPNOTSUPP; + goto err_unreg_umem; } bpf.command = XDP_SETUP_XSK_UMEM; @@ -89,18 +107,20 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, err = dev->netdev_ops->ndo_bpf(dev, &bpf); if (err) - goto err_rtnl_unlock; + goto err_unreg_umem; rtnl_unlock(); dev_hold(dev); - umem->dev = dev; - umem->queue_id = queue_id; umem->zc = true; return 0; -err_rtnl_unlock: +err_unreg_umem: + xdp_clear_umem_at_qid(dev, queue_id); + if (!force_zc) + err = 0; /* fallback to copy mode */ +out_rtnl_unlock: rtnl_unlock(); - return force_zc ? err : 0; /* fail or fallback */ + return err; } static void xdp_umem_clear_dev(struct xdp_umem *umem) @@ -108,7 +128,7 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) struct netdev_bpf bpf; int err; - if (umem->dev) { + if (umem->zc) { bpf.command = XDP_SETUP_XSK_UMEM; bpf.xsk.umem = NULL; bpf.xsk.queue_id = umem->queue_id; @@ -119,9 +139,17 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem) if (err) WARN(1, "failed to disable umem!\n"); + } + + if (umem->dev) { + rtnl_lock(); + xdp_clear_umem_at_qid(umem->dev, umem->queue_id); + rtnl_unlock(); + } + if (umem->zc) { dev_put(umem->dev); - umem->dev = NULL; + umem->zc = false; } } diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index c8be1ad3eb88..27603227601b 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -9,7 +9,7 @@ #include <net/xdp_sock.h> int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, - u32 queue_id, u16 flags); + u16 queue_id, u16 flags); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5a432dfee4ee..0577cd49aa72 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -355,12 +355,18 @@ static int xsk_release(struct socket *sock) local_bh_enable(); if (xs->dev) { + struct net_device *dev = xs->dev; + /* Wait for driver to stop using the xdp socket. */ - synchronize_net(); - dev_put(xs->dev); + xdp_del_sk_umem(xs->umem, xs); xs->dev = NULL; + synchronize_net(); + dev_put(dev); } + xskq_destroy(xs->rx); + xskq_destroy(xs->tx); + sock_orphan(sk); sock->sk = NULL; @@ -419,13 +425,6 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } qid = sxdp->sxdp_queue_id; - - if ((xs->rx && qid >= dev->real_num_rx_queues) || - (xs->tx && qid >= dev->real_num_tx_queues)) { - err = -EINVAL; - goto out_unlock; - } - flags = sxdp->sxdp_flags; if (flags & XDP_SHARED_UMEM) { @@ -721,9 +720,6 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; - xskq_destroy(xs->rx); - xskq_destroy(xs->tx); - xdp_del_sk_umem(xs->umem, xs); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index df7ca2dabc48..ca7a207b81a9 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1007,7 +1007,7 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) int err; err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy, - NULL); + cb->extack); if (err < 0) return err; |
