| author | Alexei Starovoitov <ast@kernel.org> | 2018-09-14 12:04:34 -0700 |
|---|---|---|
| committer | Alexei Starovoitov <ast@kernel.org> | 2018-09-14 12:04:34 -0700 |
| commit | 4a9f42c9dcbfb8da40bfeaa923b6a740a64a889b | |
| tree | a80dc2e952a61a052a786792e1400190eb526815 /net | |
| parent | 1edb6e035eb72a17462ba275fe2db36c37a62909 | |
| parent | 50b3ed57dee9cd0e06c59826cec8af14b51bab3e | |
Merge branch 'bpf-flow-dissector'
Petar Penkov says:
====================
This patch series hardens the RX stack by allowing flow dissection in BPF,
as previously discussed [1]. Because of the rigorous checks of the BPF
verifier, this provides significant security guarantees. In particular, the
BPF flow dissector cannot get stuck in an infinite loop, as with
CVE-2013-4348, because BPF programs are guaranteed to terminate. It cannot
read outside of packet bounds, because all memory accesses are checked.
Also, with BPF the administrator can decide which protocols to support,
reducing the potential attack surface. Rarely encountered protocols can be
excluded from dissection, and the program can be updated without a kernel
recompile or reboot if a bug is discovered.
Patch 1 adds infrastructure to execute a BPF program in __skb_flow_dissect.
This includes a new BPF program and attach type.
Patch 2 adds the new BPF flow dissector definitions to tools/uapi.
Patch 3 adds support for the new BPF program type to libbpf and bpftool.
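As a rough illustration of how these pieces fit together (this snippet is not
part of the series), user space could load and attach such a program with
libbpf along the following lines. Only BPF_PROG_TYPE_FLOW_DISSECTOR and the
BPF_FLOW_DISSECTOR attach type come from this series; the object file name,
function name, and header paths are assumptions, and since the hook is
per-netns there is no target fd to pass:

/* Hypothetical user space sketch: load a flow dissector object and attach it
 * to the current network namespace. Error handling is trimmed for brevity and
 * "bpf_flow.o" is an assumed object file name.
 */
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int attach_flow_dissector(void)
{
	struct bpf_object *obj;
	int prog_fd, err;

	err = bpf_prog_load("bpf_flow.o", BPF_PROG_TYPE_FLOW_DISSECTOR,
			    &obj, &prog_fd);
	if (err)
		return err;

	/* The flow dissector hook has no target object, so target_fd is 0 */
	return bpf_prog_attach(prog_fd, 0, BPF_FLOW_DISSECTOR, 0);
}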
Patch 4 adds a flow dissector program in BPF. It parses most of the protocols
handled by __skb_flow_dissect for a subset of flow keys (basic, control, ports,
and address types).
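To make the shape of such a program concrete, here is a minimal sketch
(illustration only, not the demo program from patch 4) that recognizes plain
IPv4/UDP and nothing else. It leans on the pieces introduced by this series:
direct packet access, the flow_keys pointer in __sk_buff, the
kernel-initialized nhoff field, and a BPF_OK return on success. The section
name, helper headers, and everything else are illustrative assumptions:

/* Minimal illustrative flow dissector: IPv4/UDP only. VLAN, IPv6, fragments,
 * and encapsulation are deliberately left out.
 */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"

SEC("flow_dissector")
int dissect(struct __sk_buff *skb)
{
	void *data = (void *)(long)skb->data;
	void *data_end = (void *)(long)skb->data_end;
	struct bpf_flow_keys *keys = skb->flow_keys;
	struct iphdr *iph;
	struct udphdr *udph;

	/* nhoff is set by the kernel to the initial parsing offset */
	iph = data + keys->nhoff;
	if ((void *)(iph + 1) > data_end || iph->version != 4)
		return BPF_DROP;

	keys->n_proto = bpf_htons(ETH_P_IP);
	keys->addr_proto = ETH_P_IP;
	keys->ipv4_src = iph->saddr;
	keys->ipv4_dst = iph->daddr;
	keys->ip_proto = iph->protocol;
	keys->thoff = keys->nhoff + iph->ihl * 4;

	if (iph->protocol != IPPROTO_UDP)
		return BPF_DROP;

	udph = data + keys->thoff;
	if ((void *)(udph + 1) > data_end)
		return BPF_DROP;

	keys->sport = udph->source;
	keys->dport = udph->dest;

	/* Any return value other than BPF_OK means dissection failed */
	return BPF_OK;
}

char _license[] SEC("license") = "GPL";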
Patch 5 adds a selftest that attaches the BPF program to the flow dissector
and sends traffic with different levels of encapsulation.
Performance Evaluation:
The in-kernel implementation was compared against the demo program from
patch 4 using the test in patch 5 with IPv4/UDP traffic over 10 seconds.
$ perf record -a -C 4 taskset -c 4 ./test_flow_dissector -i 4 -f 8 \
      -t 10
In-kernel Dissector:
__skb_flow_dissect overhead: 2.12%
Total Packets: 3,272,597 (from output of ./test_flow_dissector)
BPF Dissector:
__skb_flow_dissect overhead: 1.63%
Total Packets: 3,232,356 (from output of ./test_flow_dissector)
No-op BPF Dissector:
__skb_flow_dissect overhead: 1.52%
Total Packets: 3,330,635 (from output of ./test_flow_dissector)
Changes since v3:
1/ struct bpf_flow_keys reorganized to remove holes in patch 1 and patch 2.
Changes since v2:
1/ Changes to tools/include/uapi pulled into a separate patch 2
2/ Changes to tools/lib and tools/bpftool pulled into a separate patch 3
3/ Changed flow_keys in __sk_buff from __u32 to struct bpf_flow_keys *
4/ Added nhoff field in struct bpf_flow_keys to pass initial offset
5/ Saving all of the modified control block, rather than just the qdisc part
6/ Sample BPF program in patch 4 modified to use the changes above
Changes since v1:
1/ LD_ABS instructions now disallowed for the new BPF prog type
2/ now checks if skb is NULL in __skb_flow_dissect()
3/ fixed incorrect accesses in flow_dissector_is_valid_access()
- writes to the flow_keys field now disallowed
- reads/writes to tc_classid and data_meta now disallowed
4/ headers are now pulled with bpf_skb_load_bytes() if direct packet access fails
Changes since RFC:
1/ Flow dissector hook changed from global to per-netns
2/ Defined struct bpf_flow_keys to be used in BPF flow dissector
programs instead of exposing the internal flow keys layout. Added a
function to translate from bpf_flow_keys to the internal layout after BPF
dissection is complete. The pointer to this struct is stored in
qdisc_skb_cb rather than inside the 20-byte control block, which
simplifies verification and allows access to all 20 bytes of the cb.
3/ Removed GUE parsing as it relied on a hardcoded port
4/ MPLS parsing now stops at the first label, which is consistent
with the in-kernel flow dissector
5/ Refactored to use direct packet access and to write out to
struct bpf_flow_keys
[1] http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'net')
| -rw-r--r-- | net/core/filter.c | 70 |
| -rw-r--r-- | net/core/flow_dissector.c | 134 |
2 files changed, 204 insertions, 0 deletions
diff --git a/net/core/filter.c b/net/core/filter.c
index bf5b6efd369a..9cc76f134ddb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5124,6 +5124,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 }
 
 static const struct bpf_func_proto *
+flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
 lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
@@ -5241,6 +5252,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 		if (size != size_default)
 			return false;
 		break;
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
+		if (size != sizeof(struct bpf_flow_keys *))
+			return false;
+		break;
 	default:
 		/* Only narrow read access allowed for now. */
 		if (type == BPF_WRITE) {
@@ -5266,6 +5281,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
 	}
@@ -5291,6 +5307,7 @@ static bool lwt_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 		return false;
 	}
 
@@ -5501,6 +5518,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
 	}
@@ -5702,6 +5720,7 @@ static bool sk_skb_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
 		return false;
 	}
 
@@ -5761,6 +5780,39 @@ static bool sk_msg_is_valid_access(int off, int size,
 	return true;
 }
 
+static bool flow_dissector_is_valid_access(int off, int size,
+					   enum bpf_access_type type,
+					   const struct bpf_prog *prog,
+					   struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+			break;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case bpf_ctx_range(struct __sk_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case bpf_ctx_range(struct __sk_buff, flow_keys):
+		info->reg_type = PTR_TO_FLOW_KEYS;
+		break;
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+		return false;
+	}
+
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
@@ -6055,6 +6107,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				      bpf_target_off(struct sock_common,
 						     skc_num, 2, target_size));
 		break;
+
+	case offsetof(struct __sk_buff, flow_keys):
+		off = si->off;
+		off -= offsetof(struct __sk_buff, flow_keys);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct qdisc_skb_cb, flow_keys);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
+		break;
 	}
 
 	return insn - insn_buf;
@@ -7018,6 +7079,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
 const struct bpf_prog_ops sk_msg_prog_ops = {
 };
 
+const struct bpf_verifier_ops flow_dissector_verifier_ops = {
+	.get_func_proto = flow_dissector_func_proto,
+	.is_valid_access = flow_dissector_is_valid_access,
+	.convert_ctx_access = bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops flow_dissector_prog_ops = {
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ce9eeeb7c024..5c5dd74b5b3b 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -25,6 +25,9 @@
 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
 #include <uapi/linux/batadv_packet.h>
+#include <linux/bpf.h>
+
+static DEFINE_MUTEX(flow_dissector_mutex);
 
 static void dissector_set_key(struct flow_dissector *flow_dissector,
 			      enum flow_dissector_key_id key_id)
@@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 }
 EXPORT_SYMBOL(skb_flow_dissector_init);
 
+int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+				       struct bpf_prog *prog)
+{
+	struct bpf_prog *attached;
+	struct net *net;
+
+	net = current->nsproxy->net_ns;
+	mutex_lock(&flow_dissector_mutex);
+	attached = rcu_dereference_protected(net->flow_dissector_prog,
+					     lockdep_is_held(&flow_dissector_mutex));
+	if (attached) {
+		/* Only one BPF program can be attached at a time */
+		mutex_unlock(&flow_dissector_mutex);
+		return -EEXIST;
+	}
+	rcu_assign_pointer(net->flow_dissector_prog, prog);
+	mutex_unlock(&flow_dissector_mutex);
+	return 0;
+}
+
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
+{
+	struct bpf_prog *attached;
+	struct net *net;
+
+	net = current->nsproxy->net_ns;
+	mutex_lock(&flow_dissector_mutex);
+	attached = rcu_dereference_protected(net->flow_dissector_prog,
+					     lockdep_is_held(&flow_dissector_mutex));
+	if (!attached) {
+		mutex_unlock(&flow_dissector_mutex);
+		return -ENOENT;
+	}
+	bpf_prog_put(attached);
+	RCU_INIT_POINTER(net->flow_dissector_prog, NULL);
+	mutex_unlock(&flow_dissector_mutex);
+	return 0;
+}
 /**
  * skb_flow_get_be16 - extract be16 entity
  * @skb: sk_buff to extract from
@@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
 	return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
 }
 
+static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
+				     struct flow_dissector *flow_dissector,
+				     void *target_container)
+{
+	struct flow_dissector_key_control *key_control;
+	struct flow_dissector_key_basic *key_basic;
+	struct flow_dissector_key_addrs *key_addrs;
+	struct flow_dissector_key_ports *key_ports;
+
+	key_control = skb_flow_dissector_target(flow_dissector,
+						FLOW_DISSECTOR_KEY_CONTROL,
+						target_container);
+	key_control->thoff = flow_keys->thoff;
+	if (flow_keys->is_frag)
+		key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+	if (flow_keys->is_first_frag)
+		key_control->flags |= FLOW_DIS_FIRST_FRAG;
+	if (flow_keys->is_encap)
+		key_control->flags |= FLOW_DIS_ENCAPSULATION;
+
+	key_basic = skb_flow_dissector_target(flow_dissector,
+					      FLOW_DISSECTOR_KEY_BASIC,
+					      target_container);
+	key_basic->n_proto = flow_keys->n_proto;
+	key_basic->ip_proto = flow_keys->ip_proto;
+
+	if (flow_keys->addr_proto == ETH_P_IP &&
+	    dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+		key_addrs = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+						      target_container);
+		key_addrs->v4addrs.src = flow_keys->ipv4_src;
+		key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
+		key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+	} else if (flow_keys->addr_proto == ETH_P_IPV6 &&
+		   dissector_uses_key(flow_dissector,
+				      FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+		key_addrs = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+						      target_container);
+		memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src,
+		       sizeof(key_addrs->v6addrs));
+		key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+	}
+
+	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
+		key_ports = skb_flow_dissector_target(flow_dissector,
+						      FLOW_DISSECTOR_KEY_PORTS,
+						      target_container);
+		key_ports->src = flow_keys->sport;
+		key_ports->dst = flow_keys->dport;
+	}
+}
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 	struct flow_dissector_key_vlan *key_vlan;
 	enum flow_dissect_ret fdret;
 	enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+	struct bpf_prog *attached;
 	int num_hdrs = 0;
 	u8 ip_proto = 0;
 	bool ret;
@@ -658,6 +754,44 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 					      FLOW_DISSECTOR_KEY_BASIC,
 					      target_container);
 
+	rcu_read_lock();
+	attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog)
+		       : NULL;
+	if (attached) {
+		/* Note that even though the const qualifier is discarded
+		 * throughout the execution of the BPF program, all changes(the
+		 * control block) are reverted after the BPF program returns.
+		 * Therefore, __skb_flow_dissect does not alter the skb.
+		 */
+		struct bpf_flow_keys flow_keys = {};
+		struct bpf_skb_data_end cb_saved;
+		struct bpf_skb_data_end *cb;
+		u32 result;
+
+		cb = (struct bpf_skb_data_end *)skb->cb;
+
+		/* Save Control Block */
+		memcpy(&cb_saved, cb, sizeof(cb_saved));
+		memset(cb, 0, sizeof(cb_saved));
+
+		/* Pass parameters to the BPF program */
+		cb->qdisc_cb.flow_keys = &flow_keys;
+		flow_keys.nhoff = nhoff;
+
+		bpf_compute_data_pointers((struct sk_buff *)skb);
+		result = BPF_PROG_RUN(attached, skb);
+
+		/* Restore state */
+		memcpy(cb, &cb_saved, sizeof(cb_saved));
+
+		__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+					 target_container);
+		key_control->thoff = min_t(u16, key_control->thoff, skb->len);
+		rcu_read_unlock();
+		return result == BPF_OK;
+	}
+	rcu_read_unlock();
+
 	if (dissector_uses_key(flow_dissector,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
 		struct ethhdr *eth = eth_hdr(skb);
