diff options
| -rw-r--r-- | include/linux/netdevice.h | 13 | ||||
| -rw-r--r-- | include/net/netfilter/nf_conntrack_count.h | 17 | ||||
| -rw-r--r-- | include/net/netfilter/nf_flow_table.h | 26 | ||||
| -rw-r--r-- | include/uapi/linux/netfilter/nf_tables.h | 14 | ||||
| -rw-r--r-- | include/uapi/linux/netfilter_ipv6/ip6t_srh.h | 40 | ||||
| -rw-r--r-- | net/ipv4/ipip.c | 25 | ||||
| -rw-r--r-- | net/netfilter/Makefile | 1 | ||||
| -rw-r--r-- | net/netfilter/nf_conncount.c | 211 | ||||
| -rw-r--r-- | net/netfilter/nf_flow_table_core.c | 5 | ||||
| -rw-r--r-- | net/netfilter/nf_flow_table_ip.c | 293 | ||||
| -rw-r--r-- | net/netfilter/nf_flow_table_offload.c | 2 | ||||
| -rw-r--r-- | net/netfilter/nf_flow_table_path.c | 330 | ||||
| -rw-r--r-- | net/netfilter/nft_connlimit.c | 54 | ||||
| -rw-r--r-- | net/netfilter/nft_flow_offload.c | 252 | ||||
| -rw-r--r-- | net/netfilter/xt_connlimit.c | 14 | ||||
| -rw-r--r-- | net/openvswitch/conntrack.c | 16 | ||||
| -rwxr-xr-x | tools/testing/selftests/net/netfilter/nft_flowtable.sh | 126 |
17 files changed, 980 insertions, 459 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e808071dbb7d..bf99fe8622da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -877,6 +877,7 @@ enum net_device_path_type { DEV_PATH_PPPOE, DEV_PATH_DSA, DEV_PATH_MTK_WDMA, + DEV_PATH_TUN, }; struct net_device_path { @@ -889,6 +890,18 @@ struct net_device_path { u8 h_dest[ETH_ALEN]; } encap; struct { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + + u8 l3_proto; + } tun; + struct { enum { DEV_PATH_BR_VLAN_KEEP, DEV_PATH_BR_VLAN_TAG, diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 1b58b5b91ff6..52a06de41aa0 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -18,15 +18,14 @@ struct nf_conncount_list { struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen); void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data); -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); - -int nf_conncount_add(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key); + +int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb, + u16 l3num, struct nf_conncount_list *list); void nf_conncount_list_init(struct nf_conncount_list *list); diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index c003cd194fa2..b09c11c048d5 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -107,6 +107,19 @@ enum flow_offload_xmit_type { #define NF_FLOW_TABLE_ENCAP_MAX 2 +struct flow_offload_tunnel { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + + u8 l3_proto; +}; + struct flow_offload_tuple { union { struct in_addr src_v4; @@ -130,22 +143,25 @@ struct flow_offload_tuple { __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; + /* All members above are keys for lookups, see flow_offload_hash(). */ struct { } __hash; u8 dir:2, xmit_type:3, encap_num:2, + tun_num:2, in_vlan_ingress:2; u16 mtu; union { struct { struct dst_entry *dst_cache; + u32 ifidx; u32 dst_cookie; }; struct { u32 ifidx; - u32 hw_ifidx; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; @@ -206,7 +222,9 @@ struct nf_flow_route { u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; u8 num_encaps:2, + num_tuns:2, ingress_vlans:2; } in; struct { @@ -222,6 +240,12 @@ struct nf_flow_route { struct flow_offload *flow_offload_alloc(struct nf_conn *ct); void flow_offload_free(struct flow_offload *flow); +struct nft_flowtable; +struct nft_pktinfo; +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft); + static inline int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, flow_setup_cb_t *cb, void *cb_priv) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7c0c915f0306..45c71f7d21c2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -881,7 +881,7 @@ enum nft_exthdr_flags { * enum nft_exthdr_op - nf_tables match options * * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers - * @NFT_EXTHDR_OP_TCP: match against tcp options + * @NFT_EXTHDR_OP_TCPOPT: match against tcp options * @NFT_EXTHDR_OP_IPV4: match against ipv4 options * @NFT_EXTHDR_OP_SCTP: match against sctp chunks * @NFT_EXTHDR_OP_DCCP: match against dccp otions @@ -1200,7 +1200,7 @@ enum nft_ct_attributes { #define NFTA_CT_MAX (__NFTA_CT_MAX - 1) /** - * enum nft_flow_attributes - ct offload expression attributes + * enum nft_offload_attributes - ct offload expression attributes * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING) */ enum nft_offload_attributes { @@ -1410,7 +1410,7 @@ enum nft_reject_types { }; /** - * enum nft_reject_code - Generic reject codes for IPv4/IPv6 + * enum nft_reject_inet_code - Generic reject codes for IPv4/IPv6 * * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable @@ -1480,9 +1480,9 @@ enum nft_nat_attributes { /** * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes * - * NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) - * NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) - * NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) + * @NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) + * @NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) + * @NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) */ enum nft_tproxy_attributes { NFTA_TPROXY_UNSPEC, @@ -1783,7 +1783,7 @@ enum nft_synproxy_attributes { #define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1) /** - * enum nft_device_attributes - nf_tables device netlink attributes + * enum nft_devices_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING) diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h index 54ed83360dac..80c66c8ece82 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h @@ -41,13 +41,13 @@ /** * struct ip6t_srh - SRH match options - * @ next_hdr: Next header field of SRH - * @ hdr_len: Extension header length field of SRH - * @ segs_left: Segments left field of SRH - * @ last_entry: Last entry field of SRH - * @ tag: Tag field of SRH - * @ mt_flags: match options - * @ mt_invflags: Invert the sense of match options + * @next_hdr: Next header field of SRH + * @hdr_len: Extension header length field of SRH + * @segs_left: Segments left field of SRH + * @last_entry: Last entry field of SRH + * @tag: Tag field of SRH + * @mt_flags: match options + * @mt_invflags: Invert the sense of match options */ struct ip6t_srh { @@ -62,19 +62,19 @@ struct ip6t_srh { /** * struct ip6t_srh1 - SRH match options (revision 1) - * @ next_hdr: Next header field of SRH - * @ hdr_len: Extension header length field of SRH - * @ segs_left: Segments left field of SRH - * @ last_entry: Last entry field of SRH - * @ tag: Tag field of SRH - * @ psid_addr: Address of previous SID in SRH SID list - * @ nsid_addr: Address of NEXT SID in SRH SID list - * @ lsid_addr: Address of LAST SID in SRH SID list - * @ psid_msk: Mask of previous SID in SRH SID list - * @ nsid_msk: Mask of next SID in SRH SID list - * @ lsid_msk: MAsk of last SID in SRH SID list - * @ mt_flags: match options - * @ mt_invflags: Invert the sense of match options + * @next_hdr: Next header field of SRH + * @hdr_len: Extension header length field of SRH + * @segs_left: Segments left field of SRH + * @last_entry: Last entry field of SRH + * @tag: Tag field of SRH + * @psid_addr: Address of previous SID in SRH SID list + * @nsid_addr: Address of NEXT SID in SRH SID list + * @lsid_addr: Address of LAST SID in SRH SID list + * @psid_msk: Mask of previous SID in SRH SID list + * @nsid_msk: Mask of next SID in SRH SID list + * @lsid_msk: MAsk of last SID in SRH SID list + * @mt_flags: match options + * @mt_invflags: Invert the sense of match options */ struct ip6t_srh1 { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3e03af073a1c..ff95b1b9908e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -353,6 +353,30 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) return ip_tunnel_ctl(dev, p, cmd); } +static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct ip_tunnel *tunnel = netdev_priv(ctx->dev); + const struct iphdr *tiph = &tunnel->parms.iph; + struct rtable *rt; + + rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, + RT_SCOPE_UNIVERSE); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + path->type = DEV_PATH_TUN; + path->tun.src_v4.s_addr = tiph->saddr; + path->tun.dst_v4.s_addr = tiph->daddr; + path->tun.l3_proto = IPPROTO_IPIP; + path->dev = ctx->dev; + + ctx->dev = rt->dst.dev; + ip_rt_put(rt); + + return 0; +} + static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, @@ -362,6 +386,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, + .ndo_fill_forward_path = ipip_fill_forward_path, }; #define IPIP_FEATURES (NETIF_F_SG | \ diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e43e20f529f8..6bfc250e474f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -141,6 +141,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ + nf_flow_table_path.o \ nf_flow_table_offload.o nf_flow_table_xdp.o nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o ifeq ($(CONFIG_NF_FLOW_TABLE),m) diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 913ede2f57f9..f1be4dd5cf85 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -122,15 +122,65 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } +static bool get_ct_or_tuple_from_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conn **ct, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone **zone, + bool *refcounted) +{ + const struct nf_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; + struct nf_conn *found_ct; + + found_ct = nf_ct_get(skb, &ctinfo); + if (found_ct && !nf_ct_is_template(found_ct)) { + *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + *zone = nf_ct_zone(found_ct); + *ct = found_ct; + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple)) + return false; + + if (found_ct) + *zone = nf_ct_zone(found_ct); + + h = nf_conntrack_find_get(net, *zone, tuple); + if (!h) + return true; + + found_ct = nf_ct_tuplehash_to_ctrack(h); + *refcounted = true; + *ct = found_ct; + + return true; +} + static int __nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct = NULL; struct nf_conn *found_ct; unsigned int collect = 0; + bool refcounted = false; + + if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) + return -ENOENT; + + if (ct && nf_ct_is_confirmed(ct)) { + if (refcounted) + nf_ct_put(ct); + return -EEXIST; + } if ((u32)jiffies == list->last_gc) goto add_new_node; @@ -144,10 +194,10 @@ static int __nf_conncount_add(struct net *net, if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - return 0; /* already exists */ + goto out_put; /* already exists */ } else { collect++; } @@ -156,7 +206,7 @@ static int __nf_conncount_add(struct net *net, found_ct = nf_ct_tuplehash_to_ctrack(found); - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -165,7 +215,7 @@ static int __nf_conncount_add(struct net *net, * Attempt to avoid a re-add in this case. */ nf_ct_put(found_ct); - return 0; + goto out_put; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -188,31 +238,35 @@ add_new_node: if (conn == NULL) return -ENOMEM; - conn->tuple = *tuple; + conn->tuple = tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; list_add_tail(&conn->node, &list->head); list->count++; list->last_gc = (u32)jiffies; + +out_put: + if (refcounted) + nf_ct_put(ct); return 0; } -int nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +int nf_conncount_add_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { int ret; /* check the saved connections */ spin_lock_bh(&list->list_lock); - ret = __nf_conncount_add(net, list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, list); spin_unlock_bh(&list->list_lock); return ret; } -EXPORT_SYMBOL_GPL(nf_conncount_add); +EXPORT_SYMBOL_GPL(nf_conncount_add_skb); void nf_conncount_list_init(struct nf_conncount_list *list) { @@ -224,8 +278,8 @@ void nf_conncount_list_init(struct nf_conncount_list *list) EXPORT_SYMBOL_GPL(nf_conncount_list_init); /* Return true if the list is empty. Must be called with BH disabled. */ -bool nf_conncount_gc_list(struct net *net, - struct nf_conncount_list *list) +static bool __nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -237,10 +291,6 @@ bool nf_conncount_gc_list(struct net *net, if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; - /* don't bother if other cpu is already doing GC */ - if (!spin_trylock(&list->list_lock)) - return false; - list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn); if (IS_ERR(found)) { @@ -269,7 +319,21 @@ bool nf_conncount_gc_list(struct net *net, if (!list->count) ret = true; list->last_gc = (u32)jiffies; - spin_unlock(&list->list_lock); + + return ret; +} + +bool nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) +{ + bool ret; + + /* don't bother if other cpu is already doing GC */ + if (!spin_trylock_bh(&list->list_lock)) + return false; + + ret = __nf_conncount_gc_list(net, list); + spin_unlock_bh(&list->list_lock); return ret; } @@ -309,19 +373,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree) static unsigned int insert_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, struct rb_root *root, unsigned int hash, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + bool do_gc = true, refcounted = false; + unsigned int count = 0, gc_count = 0; struct rb_node **rbnode, *parent; - struct nf_conncount_rb *rbconn; + struct nf_conntrack_tuple tuple; struct nf_conncount_tuple *conn; - unsigned int count = 0, gc_count = 0; - bool do_gc = true; + struct nf_conncount_rb *rbconn; + struct nf_conn *ct = NULL; spin_lock_bh(&nf_conncount_locks[hash]); restart: @@ -340,8 +407,8 @@ restart: } else { int ret; - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); - if (ret) + ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); + if (ret && ret != -EEXIST) count = 0; /* hotdrop */ else count = rbconn->list.count; @@ -364,30 +431,35 @@ restart: goto restart; } - /* expected case: match, insert new node */ - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); - if (rbconn == NULL) - goto out_unlock; + if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) { + /* expected case: match, insert new node */ + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + goto out_unlock; - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) { - kmem_cache_free(conncount_rb_cachep, rbconn); - goto out_unlock; - } + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(conncount_rb_cachep, rbconn); + goto out_unlock; + } - conn->tuple = *tuple; - conn->zone = *zone; - conn->cpu = raw_smp_processor_id(); - conn->jiffies32 = (u32)jiffies; - memcpy(rbconn->key, key, sizeof(u32) * data->keylen); + conn->tuple = tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + memcpy(rbconn->key, key, sizeof(u32) * data->keylen); - nf_conncount_list_init(&rbconn->list); - list_add(&conn->node, &rbconn->list.head); - count = 1; - rbconn->list.count = count; + nf_conncount_list_init(&rbconn->list); + list_add(&conn->node, &rbconn->list.head); + count = 1; + rbconn->list.count = count; - rb_link_node_rcu(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); + rb_link_node_rcu(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + + if (refcounted) + nf_ct_put(ct); + } out_unlock: spin_unlock_bh(&nf_conncount_locks[hash]); return count; @@ -395,10 +467,10 @@ out_unlock: static unsigned int count_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct rb_root *root; struct rb_node *parent; @@ -422,7 +494,7 @@ count_tree(struct net *net, } else { int ret; - if (!tuple) { + if (!skb) { nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; } @@ -437,19 +509,23 @@ count_tree(struct net *net, } /* same source network -> be counted! */ - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); spin_unlock_bh(&rbconn->list.list_lock); - if (ret) + if (ret && ret != -EEXIST) { return 0; /* hotdrop */ - else + } else { + /* -EEXIST means add was skipped, update the list */ + if (ret == -EEXIST) + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } } } - if (!tuple) + if (!skb) return 0; - return insert_tree(net, data, root, hash, key, tuple, zone); + return insert_tree(net, skb, l3num, data, root, hash, key); } static void tree_gc_worker(struct work_struct *work) @@ -511,18 +587,19 @@ next: } /* Count and return number of conntrack entries in 'net' with particular 'key'. - * If 'tuple' is not null, insert it into the accounting data structure. - * Call with RCU read lock. + * If 'skb' is not null, insert the corresponding tuple into the accounting + * data structure. Call with RCU read lock. */ -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key) { - return count_tree(net, data, key, tuple, zone); + return count_tree(net, skb, l3num, data, key); + } -EXPORT_SYMBOL_GPL(nf_conncount_count); +EXPORT_SYMBOL_GPL(nf_conncount_count_skb); struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 9441ac3d8c1a..06e8251a6644 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -118,7 +118,10 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->in_vlan_ingress |= BIT(j); j++; } + + flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; + flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: @@ -127,11 +130,11 @@ static int flow_offload_fill_route(struct flow_offload *flow, memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; - flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: + flow_tuple->ifidx = route->tuple[dir].out.ifindex; flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8cd4cf7ae211..78883343e5d6 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -145,8 +145,11 @@ static bool ip_has_options(unsigned int thoff) static void nf_flow_tuple_encap(struct sk_buff *skb, struct flow_offload_tuple *tuple) { + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; struct pppoe_hdr *phdr; + struct iphdr *iph; + u16 offset = 0; int i = 0; if (skb_vlan_tag_present(skb)) { @@ -159,13 +162,26 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, veth = (struct vlan_ethhdr *)skb_mac_header(skb); tuple->encap[i].id = ntohs(veth->h_vlan_TCI); tuple->encap[i].proto = skb->protocol; + inner_proto = veth->h_vlan_encapsulated_proto; + offset += VLAN_HLEN; break; case htons(ETH_P_PPP_SES): phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; + inner_proto = *((__be16 *)(phdr + 1)); + offset += PPPOE_SES_HLEN; break; } + + if (inner_proto == htons(ETH_P_IP)) { + iph = (struct iphdr *)(skb_network_header(skb) + offset); + if (iph->protocol == IPPROTO_IPIP) { + tuple->tun.dst_v4.s_addr = iph->daddr; + tuple->tun.src_v4.s_addr = iph->saddr; + tuple->tun.l3_proto = IPPROTO_IPIP; + } + } } struct nf_flowtable_ctx { @@ -277,11 +293,46 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } +static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) +{ + struct iphdr *iph; + u16 size; + + if (!pskb_may_pull(skb, sizeof(*iph) + *psize)) + return false; + + iph = (struct iphdr *)(skb_network_header(skb) + *psize); + size = iph->ihl << 2; + + if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) + return false; + + if (iph->ttl <= 1) + return false; + + if (iph->protocol == IPPROTO_IPIP) + *psize += size; + + return true; +} + +static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + + if (iph->protocol != IPPROTO_IPIP) + return; + + skb_pull(skb, iph->ihl << 2); + skb_reset_network_header(skb); +} + static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; - __be16 inner_proto; + bool ret = false; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -291,19 +342,23 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { *offset += VLAN_HLEN; - return true; + inner_proto = proto; + ret = true; } break; case htons(ETH_P_PPP_SES): if (nf_flow_pppoe_proto(skb, &inner_proto) && inner_proto == proto) { *offset += PPPOE_SES_HLEN; - return true; + ret = true; } break; } - return false; + if (inner_proto == htons(ETH_P_IP)) + ret = nf_flow_ip4_tunnel_proto(skb, offset); + + return ret; } static void nf_flow_encap_pop(struct sk_buff *skb, @@ -331,21 +386,23 @@ static void nf_flow_encap_pop(struct sk_buff *skb, break; } } + + if (skb->protocol == htons(ETH_P_IP)) + nf_flow_ip4_tunnel_pop(skb); } +struct nf_flow_xmit { + const void *dest; + const void *source; + struct net_device *outdev; +}; + static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, - const struct flow_offload_tuple_rhash *tuplehash, - unsigned short type) + struct nf_flow_xmit *xmit) { - struct net_device *outdev; - - outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); - if (!outdev) - return NF_DROP; - - skb->dev = outdev; - dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, - tuplehash->tuple.out.h_source, skb->len); + skb->dev = xmit->outdev; + dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + xmit->dest, xmit->source, skb->len); dev_queue_xmit(skb); return NF_STOLEN; @@ -357,8 +414,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {}; - if (skb->protocol != htons(ETH_P_IP) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) return NULL; if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) @@ -381,6 +437,9 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; + if (flow->tuplehash[!dir].tuple.tun_num) + mtu -= sizeof(*iph); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; @@ -414,20 +473,139 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 1; } +static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) +{ + int data_len = skb->len + sizeof(__be16); + struct ppp_hdr { + struct pppoe_hdr hdr; + __be16 proto; + } *ph; + __be16 proto; + + if (skb_cow_head(skb, PPPOE_SES_HLEN)) + return -1; + + switch (skb->protocol) { + case htons(ETH_P_IP): + proto = htons(PPP_IP); + break; + case htons(ETH_P_IPV6): + proto = htons(PPP_IPV6); + break; + default: + return -1; + } + + __skb_push(skb, PPPOE_SES_HLEN); + skb_reset_network_header(skb); + + ph = (struct ppp_hdr *)(skb->data); + ph->hdr.ver = 1; + ph->hdr.type = 1; + ph->hdr.code = 0; + ph->hdr.sid = htons(id); + ph->hdr.length = htons(data_len); + ph->proto = proto; + skb->protocol = htons(ETH_P_PPP_SES); + + return 0; +} + +static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + struct rtable *rt = dst_rtable(tuple->dst_cache); + u8 tos = iph->tos, ttl = iph->ttl; + __be16 frag_off = iph->frag_off; + u32 headroom = sizeof(*iph); + int err; + + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4); + if (err) + return err; + + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + err = skb_cow_head(skb, headroom); + if (err) + return err; + + skb_scrub_packet(skb, true); + skb_clear_hash_if_not_l4(skb); + + /* Push down and install the IP header. */ + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off; + iph->protocol = tuple->tun.l3_proto; + iph->tos = tos; + iph->daddr = tuple->tun.src_v4.s_addr; + iph->saddr = tuple->tun.dst_v4.s_addr; + iph->ttl = ttl; + iph->tot_len = htons(skb->len); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); + ip_send_check(iph); + + *ip_daddr = tuple->tun.src_v4.s_addr; + + return 0; +} + +static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + if (tuple->tun_num) + return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr); + + return 0; +} + +static int nf_flow_encap_push(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + int i; + + for (i = 0; i < tuple->encap_num; i++) { + switch (tuple->encap[i].proto) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + if (skb_vlan_push(skb, tuple->encap[i].proto, + tuple->encap[i].id) < 0) + return -1; + break; + case htons(ETH_P_PPP_SES): + if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) + return -1; + break; + } + } + + return 0; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, }; + struct nf_flow_xmit xmit = {}; struct flow_offload *flow; - struct net_device *outdev; + struct neighbour *neigh; struct rtable *rt; - __be32 nexthop; + __be32 ip_daddr; int ret; tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); @@ -450,29 +628,46 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip_daddr = other_tuple->src_v4.s_addr; + + if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) + return NF_DROP; + + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rtable(tuplehash->tuple.dst_cache); - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); @@ -715,13 +910,15 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, { struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, }; - const struct in6_addr *nexthop; + struct nf_flow_xmit xmit = {}; + struct in6_addr *ip6_daddr; struct flow_offload *flow; - struct net_device *outdev; + struct neighbour *neigh; struct rt6_info *rt; int ret; @@ -745,28 +942,42 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip6_daddr = &other_tuple->src_v6; + + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rt6_info(tuplehash->tuple.dst_cache); - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e06bc36f49fe..d8f7bfd60ac6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -555,7 +555,7 @@ static void flow_offload_redirect(struct net *net, switch (this_tuple->xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: this_tuple = &flow->tuplehash[dir].tuple; - ifindex = this_tuple->out.hw_ifidx; + ifindex = this_tuple->out.ifidx; break; case FLOW_OFFLOAD_XMIT_NEIGH: other_tuple = &flow->tuplehash[!dir].tuple; diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c new file mode 100644 index 000000000000..f0984cf69a09 --- /dev/null +++ b/net/netfilter/nf_flow_table_path.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_common.h> +#include <linux/netfilter/nf_tables.h> +#include <net/ip.h> +#include <net/inet_dscp.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_flow_table.h> + +static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) +{ + if (dst_xfrm(dst)) + return FLOW_OFFLOAD_XMIT_XFRM; + + return FLOW_OFFLOAD_XMIT_NEIGH; +} + +static void nft_default_forward_path(struct nf_flow_route *route, + struct dst_entry *dst_cache, + enum ip_conntrack_dir dir) +{ + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; + route->tuple[dir].dst = dst_cache; + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); +} + +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + +static int nft_dev_fill_forward_path(const struct nf_flow_route *route, + const struct dst_entry *dst_cache, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, u8 *ha, + struct net_device_path_stack *stack) +{ + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; + struct net_device *dev = dst_cache->dev; + struct neighbour *n; + u8 nud_state; + + if (!nft_is_valid_ether_device(dev)) + goto out; + + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -1; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + neigh_release(n); + + if (!(nud_state & NUD_VALID)) + return -1; + +out: + return dev_fill_forward_path(dev, ha, stack); +} + +struct nft_forward_info { + const struct net_device *indev; + const struct net_device *outdev; + struct id { + __u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; + u8 num_encaps; + struct flow_offload_tunnel tun; + u8 num_tuns; + u8 ingress_vlans; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + enum flow_offload_xmit_type xmit_type; +}; + +static void nft_dev_path_info(const struct net_device_path_stack *stack, + struct nft_forward_info *info, + unsigned char *ha, struct nf_flowtable *flowtable) +{ + const struct net_device_path *path; + int i; + + memcpy(info->h_dest, ha, ETH_ALEN); + + for (i = 0; i < stack->num_paths; i++) { + path = &stack->path[i]; + switch (path->type) { + case DEV_PATH_ETHERNET: + case DEV_PATH_DSA: + case DEV_PATH_VLAN: + case DEV_PATH_PPPOE: + case DEV_PATH_TUN: + info->indev = path->dev; + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + if (path->type == DEV_PATH_ETHERNET) + break; + if (path->type == DEV_PATH_DSA) { + i = stack->num_paths; + break; + } + + /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */ + if (path->type == DEV_PATH_TUN) { + if (info->num_tuns) { + info->indev = NULL; + break; + } + info->tun.src_v6 = path->tun.src_v6; + info->tun.dst_v6 = path->tun.dst_v6; + info->tun.l3_proto = path->tun.l3_proto; + info->num_tuns++; + } else { + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = + path->encap.id; + info->encap[info->num_encaps].proto = + path->encap.proto; + info->num_encaps++; + } + if (path->type == DEV_PATH_PPPOE) + memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + break; + case DEV_PATH_BRIDGE: + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_UNTAG_HW: + info->ingress_vlans |= BIT(info->num_encaps - 1); + break; + case DEV_PATH_BR_VLAN_TAG: + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = path->bridge.vlan_id; + info->encap[info->num_encaps].proto = path->bridge.vlan_proto; + info->num_encaps++; + break; + case DEV_PATH_BR_VLAN_UNTAG: + if (WARN_ON_ONCE(info->num_encaps-- == 0)) { + info->indev = NULL; + break; + } + break; + case DEV_PATH_BR_VLAN_KEEP: + break; + } + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + break; + default: + info->indev = NULL; + break; + } + } + info->outdev = info->indev; + + if (nf_flowtable_hw_offload(flowtable) && + nft_is_valid_ether_device(info->indev)) + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; +} + +static bool nft_flowtable_find_dev(const struct net_device *dev, + struct nft_flowtable *ft) +{ + struct nft_hook *hook; + bool found = false; + + list_for_each_entry_rcu(hook, &ft->hook_list, list) { + if (!nft_hook_find_ops_rcu(hook, dev)) + continue; + + found = true; + break; + } + + return found; +} + +static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt, + struct flow_offload_tunnel *tun, + struct nf_flow_route *route, + enum ip_conntrack_dir dir) +{ + struct dst_entry *cur_dst = route->tuple[dir].dst; + struct dst_entry *tun_dst = NULL; + struct flowi fl = {}; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = tun->dst_v4.s_addr; + fl.u.ip4.saddr = tun->src_v4.s_addr; + fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = tun->dst_v6; + fl.u.ip6.saddr = tun->src_v6; + fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt)); + if (!tun_dst) + return -ENOENT; + + route->tuple[dir].dst = tun_dst; + dst_release(cur_dst); + + return 0; +} + +static void nft_dev_forward_path(const struct nft_pktinfo *pkt, + struct nf_flow_route *route, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + const struct dst_entry *dst = route->tuple[dir].dst; + struct net_device_path_stack stack; + struct nft_forward_info info = {}; + unsigned char ha[ETH_ALEN]; + int i; + + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) + nft_dev_path_info(&stack, &info, ha, &ft->data); + + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) + return; + + route->tuple[!dir].in.ifindex = info.indev->ifindex; + for (i = 0; i < info.num_encaps; i++) { + route->tuple[!dir].in.encap[i].id = info.encap[i].id; + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; + } + + if (info.num_tuns && + !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) { + route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; + route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; + route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; + route->tuple[!dir].in.num_tuns = info.num_tuns; + } + + route->tuple[!dir].in.num_encaps = info.num_encaps; + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; + route->tuple[dir].out.ifindex = info.outdev->ifindex; + + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); + route->tuple[dir].xmit_type = info.xmit_type; + } +} + +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + struct dst_entry *this_dst = skb_dst(pkt->skb); + struct dst_entry *other_dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; + fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; + fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; + fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; + fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; + fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + if (!dst_hold_safe(this_dst)) + return -ENOENT; + + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); + if (!other_dst) { + dst_release(this_dst); + return -ENOENT; + } + + nft_default_forward_path(route, this_dst, dir); + nft_default_forward_path(route, other_dst, !dir); + + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, route, ct, dir, ft); + if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, route, ct, !dir, ft); + + return 0; +} +EXPORT_SYMBOL_GPL(nft_flow_route); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index fc35a11cdca2..657764774a2d 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -24,33 +24,27 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, const struct nft_pktinfo *pkt, const struct nft_set_ext *ext) { - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; - const struct nf_conntrack_tuple *tuple_ptr; - struct nf_conntrack_tuple tuple; - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; unsigned int count; + int err; - tuple_ptr = &tuple; - - ct = nf_ct_get(pkt->skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), - nft_pf(pkt), nft_net(pkt), &tuple)) { - regs->verdict.code = NF_DROP; - return; - } - - if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) { - regs->verdict.code = NF_DROP; - return; + err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); + if (err) { + if (err == -EEXIST) { + /* Call gc to update the list count if any connection has + * been closed already. This is useful for softlimit + * connections like limiting bandwidth based on a number + * of open connections. + */ + nf_conncount_gc_list(nft_net(pkt), priv->list); + } else { + regs->verdict.code = NF_DROP; + return; + } } count = READ_ONCE(priv->list->count); - if ((count > priv->limit) ^ priv->invert) { + if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) { regs->verdict.code = NFT_BREAK; return; } @@ -137,6 +131,16 @@ static int nft_connlimit_obj_init(const struct nft_ctx *ctx, return nft_connlimit_do_init(ctx, tb, priv); } +static void nft_connlimit_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_connlimit *newpriv = nft_obj_data(newobj); + struct nft_connlimit *priv = nft_obj_data(obj); + + WRITE_ONCE(priv->limit, newpriv->limit); + WRITE_ONCE(priv->invert, newpriv->invert); +} + static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { @@ -166,6 +170,7 @@ static const struct nft_object_ops nft_connlimit_obj_ops = { .init = nft_connlimit_obj_init, .destroy = nft_connlimit_obj_destroy, .dump = nft_connlimit_obj_dump, + .update = nft_connlimit_obj_update, }; static struct nft_object_type nft_connlimit_obj_type __read_mostly = { @@ -238,13 +243,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool ret; - - local_bh_disable(); - ret = nf_conncount_gc_list(net, priv->list); - local_bh_enable(); - return ret; + return nf_conncount_gc_list(net, priv->list); } static struct nft_expr_type nft_connlimit_type; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 14dd1c0698c3..b8f76c9057fd 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -20,258 +20,6 @@ struct nft_flow_offload { struct nft_flowtable *flowtable; }; -static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) -{ - if (dst_xfrm(dst)) - return FLOW_OFFLOAD_XMIT_XFRM; - - return FLOW_OFFLOAD_XMIT_NEIGH; -} - -static void nft_default_forward_path(struct nf_flow_route *route, - struct dst_entry *dst_cache, - enum ip_conntrack_dir dir) -{ - route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; - route->tuple[dir].dst = dst_cache; - route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); -} - -static bool nft_is_valid_ether_device(const struct net_device *dev) -{ - if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || - dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) - return false; - - return true; -} - -static int nft_dev_fill_forward_path(const struct nf_flow_route *route, - const struct dst_entry *dst_cache, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, u8 *ha, - struct net_device_path_stack *stack) -{ - const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; - struct net_device *dev = dst_cache->dev; - struct neighbour *n; - u8 nud_state; - - if (!nft_is_valid_ether_device(dev)) - goto out; - - n = dst_neigh_lookup(dst_cache, daddr); - if (!n) - return -1; - - read_lock_bh(&n->lock); - nud_state = n->nud_state; - ether_addr_copy(ha, n->ha); - read_unlock_bh(&n->lock); - neigh_release(n); - - if (!(nud_state & NUD_VALID)) - return -1; - -out: - return dev_fill_forward_path(dev, ha, stack); -} - -struct nft_forward_info { - const struct net_device *indev; - const struct net_device *outdev; - const struct net_device *hw_outdev; - struct id { - __u16 id; - __be16 proto; - } encap[NF_FLOW_TABLE_ENCAP_MAX]; - u8 num_encaps; - u8 ingress_vlans; - u8 h_source[ETH_ALEN]; - u8 h_dest[ETH_ALEN]; - enum flow_offload_xmit_type xmit_type; -}; - -static void nft_dev_path_info(const struct net_device_path_stack *stack, - struct nft_forward_info *info, - unsigned char *ha, struct nf_flowtable *flowtable) -{ - const struct net_device_path *path; - int i; - - memcpy(info->h_dest, ha, ETH_ALEN); - - for (i = 0; i < stack->num_paths; i++) { - path = &stack->path[i]; - switch (path->type) { - case DEV_PATH_ETHERNET: - case DEV_PATH_DSA: - case DEV_PATH_VLAN: - case DEV_PATH_PPPOE: - info->indev = path->dev; - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - if (path->type == DEV_PATH_ETHERNET) - break; - if (path->type == DEV_PATH_DSA) { - i = stack->num_paths; - break; - } - - /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ - if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { - info->indev = NULL; - break; - } - if (!info->outdev) - info->outdev = path->dev; - info->encap[info->num_encaps].id = path->encap.id; - info->encap[info->num_encaps].proto = path->encap.proto; - info->num_encaps++; - if (path->type == DEV_PATH_PPPOE) - memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); - break; - case DEV_PATH_BRIDGE: - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - switch (path->bridge.vlan_mode) { - case DEV_PATH_BR_VLAN_UNTAG_HW: - info->ingress_vlans |= BIT(info->num_encaps - 1); - break; - case DEV_PATH_BR_VLAN_TAG: - info->encap[info->num_encaps].id = path->bridge.vlan_id; - info->encap[info->num_encaps].proto = path->bridge.vlan_proto; - info->num_encaps++; - break; - case DEV_PATH_BR_VLAN_UNTAG: - info->num_encaps--; - break; - case DEV_PATH_BR_VLAN_KEEP: - break; - } - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; - break; - default: - info->indev = NULL; - break; - } - } - if (!info->outdev) - info->outdev = info->indev; - - info->hw_outdev = info->indev; - - if (nf_flowtable_hw_offload(flowtable) && - nft_is_valid_ether_device(info->indev)) - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; -} - -static bool nft_flowtable_find_dev(const struct net_device *dev, - struct nft_flowtable *ft) -{ - struct nft_hook *hook; - bool found = false; - - list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (!nft_hook_find_ops_rcu(hook, dev)) - continue; - - found = true; - break; - } - - return found; -} - -static void nft_dev_forward_path(struct nf_flow_route *route, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - const struct dst_entry *dst = route->tuple[dir].dst; - struct net_device_path_stack stack; - struct nft_forward_info info = {}; - unsigned char ha[ETH_ALEN]; - int i; - - if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) - nft_dev_path_info(&stack, &info, ha, &ft->data); - - if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) - return; - - route->tuple[!dir].in.ifindex = info.indev->ifindex; - for (i = 0; i < info.num_encaps; i++) { - route->tuple[!dir].in.encap[i].id = info.encap[i].id; - route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; - } - route->tuple[!dir].in.num_encaps = info.num_encaps; - route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; - - if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { - memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); - memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); - route->tuple[dir].out.ifindex = info.outdev->ifindex; - route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; - route->tuple[dir].xmit_type = info.xmit_type; - } -} - -static int nft_flow_route(const struct nft_pktinfo *pkt, - const struct nf_conn *ct, - struct nf_flow_route *route, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - struct dst_entry *this_dst = skb_dst(pkt->skb); - struct dst_entry *other_dst = NULL; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; - fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; - fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; - fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); - fl.u.ip4.flowi4_mark = pkt->skb->mark; - fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; - break; - case NFPROTO_IPV6: - fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; - fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; - fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; - fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; - fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); - fl.u.ip6.flowi6_mark = pkt->skb->mark; - fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; - break; - } - - if (!dst_hold_safe(this_dst)) - return -ENOENT; - - nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); - if (!other_dst) { - dst_release(this_dst); - return -ENOENT; - } - - nft_default_forward_path(route, this_dst, dir); - nft_default_forward_path(route, other_dst, !dir); - - if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && - route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { - nft_dev_forward_path(route, ct, dir, ft); - nft_dev_forward_path(route, ct, !dir, ft); - } - - return 0; -} - static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 0189f8b6b0bd..848287ab79cf 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -31,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_connlimit_info *info = par->matchinfo; - struct nf_conntrack_tuple tuple; - const struct nf_conntrack_tuple *tuple_ptr = &tuple; const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; @@ -40,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) u32 key[5]; ct = nf_ct_get(skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if (ct) zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - xt_family(par), net, &tuple)) { - goto hotdrop; - } if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); @@ -69,10 +62,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) key[1] = zone->id; } - connections = nf_conncount_count(net, info->data, key, tuple_ptr, - zone); + connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); if (connections == 0) - /* kmalloc failed, drop it entirely */ + /* kmalloc failed or tuple couldn't be found, drop it entirely */ goto hotdrop; return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e573e9221302..a0811e1fba65 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -928,8 +928,8 @@ static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone) } static int ovs_ct_check_limit(struct net *net, - const struct ovs_conntrack_info *info, - const struct nf_conntrack_tuple *tuple) + const struct sk_buff *skb, + const struct ovs_conntrack_info *info) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; @@ -942,8 +942,9 @@ static int ovs_ct_check_limit(struct net *net, if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) return 0; - connections = nf_conncount_count(net, ct_limit_info->data, - &conncount_key, tuple, &info->zone); + connections = nf_conncount_count_skb(net, skb, info->family, + ct_limit_info->data, + &conncount_key); if (connections > per_zone_limit) return -ENOMEM; @@ -972,8 +973,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) if (static_branch_unlikely(&ovs_ct_limit_enabled)) { if (!nf_ct_is_confirmed(ct)) { - err = ovs_ct_check_limit(net, info, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + err = ovs_ct_check_limit(net, skb, info); if (err) { net_warn_ratelimited("openvswitch: zone: %u " "exceeds conntrack limit\n", @@ -1770,8 +1770,8 @@ static int __ovs_ct_limit_get_zone_limit(struct net *net, zone_limit.limit = limit; nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); - zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL, - &ct_zone); + zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data, + &conncount_key); return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); } diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index 45832df98295..a68bc882fa4e 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -127,6 +127,8 @@ ip -net "$nsr1" addr add fee1:2::1/64 dev veth1 nodad ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad +ip netns exec "$nsr1" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null +ip netns exec "$nsr2" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null for i in 0 1; do ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null @@ -153,7 +155,9 @@ ip -net "$ns1" route add default via dead:1::1 ip -net "$ns2" route add default via dead:2::1 ip -net "$nsr1" route add default via 192.168.10.2 +ip -6 -net "$nsr1" route add default via fee1:2::2 ip -net "$nsr2" route add default via 192.168.10.1 +ip -6 -net "$nsr2" route add default via fee1:2::1 ip netns exec "$nsr1" nft -f - <<EOF table inet filter { @@ -352,8 +356,9 @@ test_tcp_forwarding_ip() local nsa=$1 local nsb=$2 local pmtu=$3 - local dstip=$4 - local dstport=$5 + local proto=$4 + local dstip=$5 + local dstport=$6 local lret=0 local socatc local socatl @@ -363,12 +368,14 @@ test_tcp_forwarding_ip() infile="$nsin_small" fi - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -${proto} \ + TCP"${proto}"-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & lpid=$! busywait 1000 listener_ready - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -${proto} \ + TCP"${proto}":"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" socatc=$? wait $lpid @@ -394,8 +401,11 @@ test_tcp_forwarding_ip() test_tcp_forwarding() { local pmtu="$3" + local proto="$4" + local dstip="$5" + local dstport="$6" - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" return $? } @@ -403,6 +413,9 @@ test_tcp_forwarding() test_tcp_forwarding_set_dscp() { local pmtu="$3" + local proto="$4" + local dstip="$5" + local dstport="$6" ip netns exec "$nsr1" nft -f - <<EOF table netdev dscpmangle { @@ -413,7 +426,7 @@ table netdev dscpmangle { } EOF if [ $? -eq 0 ]; then - test_tcp_forwarding_ip "$1" "$2" "$3" 10.0.2.99 12345 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" check_dscp "dscp_ingress" "$pmtu" ip netns exec "$nsr1" nft delete table netdev dscpmangle @@ -430,7 +443,7 @@ table netdev dscpmangle { } EOF if [ $? -eq 0 ]; then - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" check_dscp "dscp_egress" "$pmtu" ip netns exec "$nsr1" nft delete table netdev dscpmangle @@ -441,7 +454,7 @@ fi # partial. If flowtable really works, then both dscp-is-0 and dscp-is-cs3 # counters should have seen packets (before and after ft offload kicks in). ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" check_dscp "dscp_fwd" "$pmtu" } @@ -455,7 +468,7 @@ test_tcp_forwarding_nat() [ "$pmtu" -eq 0 ] && what="$what (pmtu disabled)" - test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 10.0.2.99 12345 + test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 4 10.0.2.99 12345 lret=$? if [ "$lret" -eq 0 ] ; then @@ -465,7 +478,7 @@ test_tcp_forwarding_nat() echo "PASS: flow offload for ns1/ns2 with masquerade $what" fi - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.6.6.6 1666 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 4 10.6.6.6 1666 lret=$? if [ "$pmtu" -eq 1 ] ;then check_counters "flow offload for ns1/ns2 with dnat $what" @@ -487,7 +500,7 @@ make_file "$nsin_small" "$filesize_small" # Due to MTU mismatch in both directions, all packets (except small packets like pure # acks) have to be handled by normal forwarding path. Therefore, packet counters # are not checked. -if test_tcp_forwarding "$ns1" "$ns2" 0; then +if test_tcp_forwarding "$ns1" "$ns2" 0 4 10.0.2.99 12345; then echo "PASS: flow offloaded for ns1/ns2" else echo "FAIL: flow offload for ns1/ns2:" 1>&2 @@ -495,6 +508,14 @@ else ret=1 fi +if test_tcp_forwarding "$ns1" "$ns2" 0 6 "[dead:2::99]" 12345; then + echo "PASS: IPv6 flow offloaded for ns1/ns2" +else + echo "FAIL: IPv6 flow offload for ns1/ns2:" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + # delete default route, i.e. ns2 won't be able to reach ns1 and # will depend on ns1 being masqueraded in nsr1. # expect ns1 has nsr1 address. @@ -520,7 +541,7 @@ table ip nat { EOF check_dscp "dscp_none" "0" -if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then +if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 4 10.0.2.99 12345; then echo "FAIL: flow offload for ns1/ns2 with dscp update and no pmtu discovery" 1>&2 exit 0 fi @@ -546,7 +567,7 @@ ip netns exec "$ns2" sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null ip netns exec "$ns2" nft reset counters table inet filter >/dev/null -if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 ""; then +if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 4 10.0.2.99 12345; then echo "FAIL: flow offload for ns1/ns2 with dscp update and pmtu discovery" 1>&2 exit 0 fi @@ -558,6 +579,73 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then ip netns exec "$nsr1" nft list ruleset fi +# IPIP tunnel test: +# Add IPIP tunnel interfaces and check flowtable acceleration. +test_ipip() { +if ! ip -net "$nsr1" link add name tun0 type ipip \ + local 192.168.10.1 remote 192.168.10.2 >/dev/null;then + echo "SKIP: could not add ipip tunnel" + [ "$ret" -eq 0 ] && ret=$ksft_skip + return +fi +ip -net "$nsr1" link set tun0 up +ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 +ip -net "$nsr2" link set tun0 up +ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr1" route change default via 192.168.100.2 +ip -net "$nsr2" route change default via 192.168.100.1 +ip -net "$ns2" route add default via 10.0.2.1 + +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' +ip netns exec "$nsr1" nft -a insert rule inet filter forward \ + 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# Create vlan tagged devices for IPIP traffic. +ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10 +ip -net "$nsr1" link set veth1.10 up +ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10 +ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept' +ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2 +ip -net "$nsr1" link set tun1 up +ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1 +ip -net "$nsr1" route change default via 192.168.200.2 +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept' + +ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10 +ip -net "$nsr2" link set veth0.10 up +ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10 +ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null +ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1 +ip -net "$nsr2" link set tun1 up +ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1 +ip -net "$nsr2" route change default via 192.168.200.1 +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# Restore the previous configuration +ip -net "$nsr1" route change default via 192.168.10.2 +ip -net "$nsr2" route change default via 192.168.10.1 +ip -net "$ns2" route del default via 10.0.2.1 +} + # Another test: # Add bridge interface br0 to Router1, with NAT enabled. test_bridge() { @@ -643,6 +731,8 @@ ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad ip -net "$nsr1" link set up dev veth0 } +test_ipip + test_bridge KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) @@ -683,7 +773,7 @@ ip -net "$ns2" route del 192.168.10.1 via 10.0.2.1 ip -net "$ns2" route add default via 10.0.2.1 ip -net "$ns2" route add default via dead:2::1 -if test_tcp_forwarding "$ns1" "$ns2" 1; then +if test_tcp_forwarding "$ns1" "$ns2" 1 4 10.0.2.99 12345; then check_counters "ipsec tunnel mode for ns1/ns2" else echo "FAIL: ipsec tunnel mode for ns1/ns2" @@ -691,6 +781,14 @@ else ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 fi +if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then + check_counters "IPv6 ipsec tunnel mode for ns1/ns2" +else + echo "FAIL: IPv6 ipsec tunnel mode for ns1/ns2" + ip netns exec "$nsr1" nft list ruleset 1>&2 + ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 +fi + if [ "$1" = "" ]; then low=1280 mtu=$((65536 - low)) |
