summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/core/filter.c416
-rw-r--r--net/ipv4/tcp.c16
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_input.c127
-rw-r--r--net/ipv4/tcp_ipv4.c5
-rw-r--r--net/ipv4/tcp_minisocks.c1
-rw-r--r--net/ipv4/tcp_output.c193
-rw-r--r--net/ipv6/tcp_ipv6.c5
8 files changed, 717 insertions, 48 deletions
diff --git a/net/core/filter.c b/net/core/filter.c
index b2df52086445..47eef9a0be6a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4459,6 +4459,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
} else {
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
if (optlen != sizeof(int))
return -EINVAL;
@@ -4480,6 +4481,20 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
tp->snd_ssthresh = val;
}
break;
+ case TCP_BPF_DELACK_MAX:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_DELACK_MAX ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_delack_max = timeout;
+ break;
+ case TCP_BPF_RTO_MIN:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_RTO_MIN ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_rto_min = timeout;
+ break;
case TCP_SAVE_SYN:
if (val < 0 || val > 1)
ret = -EINVAL;
@@ -4550,9 +4565,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
tp = tcp_sk(sk);
if (optlen <= 0 || !tp->saved_syn ||
- optlen > tp->saved_syn[0])
+ optlen > tcp_saved_syn_len(tp->saved_syn))
goto err_clear;
- memcpy(optval, tp->saved_syn + 1, optlen);
+ memcpy(optval, tp->saved_syn->data, optlen);
break;
default:
goto err_clear;
@@ -4654,9 +4669,99 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
+ int optname, const u8 **start)
+{
+ struct sk_buff *syn_skb = bpf_sock->syn_skb;
+ const u8 *hdr_start;
+ int ret;
+
+ if (syn_skb) {
+ /* sk is a request_sock here */
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = syn_skb->data;
+ ret = tcp_hdrlen(syn_skb);
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = skb_network_header(syn_skb);
+ ret = skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+ hdr_start = skb_mac_header(syn_skb);
+ ret = skb_mac_header_len(syn_skb) +
+ skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
+ }
+ } else {
+ struct sock *sk = bpf_sock->sk;
+ struct saved_syn *saved_syn;
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ /* synack retransmit. bpf_sock->syn_skb will
+ * not be available. It has to resort to
+ * saved_syn (if it is saved).
+ */
+ saved_syn = inet_reqsk(sk)->saved_syn;
+ else
+ saved_syn = tcp_sk(sk)->saved_syn;
+
+ if (!saved_syn)
+ return -ENOENT;
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen;
+ ret = saved_syn->tcp_hdrlen;
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen;
+ ret = saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+
+ /* TCP_SAVE_SYN may not have saved the mac hdr */
+ if (!saved_syn->mac_hdrlen)
+ return -ENOENT;
+
+ hdr_start = saved_syn->data;
+ ret = saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ }
+ }
+
+ *start = hdr_start;
+ return ret;
+}
+
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
+ if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
+ optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
+ int ret, copy_len = 0;
+ const u8 *start;
+
+ ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
+ if (ret > 0) {
+ copy_len = ret;
+ if (optlen < copy_len) {
+ copy_len = optlen;
+ ret = -ENOSPC;
+ }
+
+ memcpy(optval, start, copy_len);
+ }
+
+ /* Zero out unused buffer at the end */
+ memset(optval + copy_len, 0, optlen - copy_len);
+
+ return ret;
+ }
+
return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
@@ -6150,6 +6255,232 @@ static const struct bpf_func_proto bpf_sk_assign_proto = {
.arg3_type = ARG_ANYTHING,
};
+static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
+ u8 search_kind, const u8 *magic,
+ u8 magic_len, bool *eol)
+{
+ u8 kind, kind_len;
+
+ *eol = false;
+
+ while (op < opend) {
+ kind = op[0];
+
+ if (kind == TCPOPT_EOL) {
+ *eol = true;
+ return ERR_PTR(-ENOMSG);
+ } else if (kind == TCPOPT_NOP) {
+ op++;
+ continue;
+ }
+
+ if (opend - op < 2 || opend - op < op[1] || op[1] < 2)
+ /* Something is wrong in the received header.
+ * Follow the TCP stack's tcp_parse_options()
+ * and just bail here.
+ */
+ return ERR_PTR(-EFAULT);
+
+ kind_len = op[1];
+ if (search_kind == kind) {
+ if (!magic_len)
+ return op;
+
+ if (magic_len > kind_len - 2)
+ return ERR_PTR(-ENOMSG);
+
+ if (!memcmp(&op[2], magic, magic_len))
+ return op;
+ }
+
+ op += kind_len;
+ }
+
+ return ERR_PTR(-ENOMSG);
+}
+
+BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ void *, search_res, u32, len, u64, flags)
+{
+ bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
+ const u8 *op, *opend, *magic, *search = search_res;
+ u8 search_kind, search_len, copy_len, magic_len;
+ int ret;
+
+ /* 2 byte is the minimal option len except TCPOPT_NOP and
+ * TCPOPT_EOL which are useless for the bpf prog to learn
+ * and this helper disallow loading them also.
+ */
+ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
+ return -EINVAL;
+
+ search_kind = search[0];
+ search_len = search[1];
+
+ if (search_len > len || search_kind == TCPOPT_NOP ||
+ search_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (search_kind == TCPOPT_EXP || search_kind == 253) {
+ /* 16 or 32 bit magic. +2 for kind and kind length */
+ if (search_len != 4 && search_len != 6)
+ return -EINVAL;
+ magic = &search[2];
+ magic_len = search_len - 2;
+ } else {
+ if (search_len)
+ return -EINVAL;
+ magic = NULL;
+ magic_len = 0;
+ }
+
+ if (load_syn) {
+ ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
+ if (ret < 0)
+ return ret;
+
+ opend = op + ret;
+ op += sizeof(struct tcphdr);
+ } else {
+ if (!bpf_sock->skb ||
+ bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ /* This bpf_sock->op cannot call this helper */
+ return -EPERM;
+
+ opend = bpf_sock->skb_data_end;
+ op = bpf_sock->skb->data + sizeof(struct tcphdr);
+ }
+
+ op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
+ &eol);
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+
+ copy_len = op[1];
+ ret = copy_len;
+ if (copy_len > len) {
+ ret = -ENOSPC;
+ copy_len = len;
+ }
+
+ memcpy(search_res, op, copy_len);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
+ .func = bpf_sock_ops_load_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ const void *, from, u32, len, u64, flags)
+{
+ u8 new_kind, new_kind_len, magic_len = 0, *opend;
+ const u8 *op, *new_op, *magic = NULL;
+ struct sk_buff *skb;
+ bool eol;
+
+ if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
+ return -EPERM;
+
+ if (len < 2 || flags)
+ return -EINVAL;
+
+ new_op = from;
+ new_kind = new_op[0];
+ new_kind_len = new_op[1];
+
+ if (new_kind_len > len || new_kind == TCPOPT_NOP ||
+ new_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (new_kind_len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ /* 253 is another experimental kind */
+ if (new_kind == TCPOPT_EXP || new_kind == 253) {
+ if (new_kind_len < 4)
+ return -EINVAL;
+ /* Match for the 2 byte magic also.
+ * RFC 6994: the magic could be 2 or 4 bytes.
+ * Hence, matching by 2 byte only is on the
+ * conservative side but it is the right
+ * thing to do for the 'search-for-duplication'
+ * purpose.
+ */
+ magic = &new_op[2];
+ magic_len = 2;
+ }
+
+ /* Check for duplication */
+ skb = bpf_sock->skb;
+ op = skb->data + sizeof(struct tcphdr);
+ opend = bpf_sock->skb_data_end;
+
+ op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
+ &eol);
+ if (!IS_ERR(op))
+ return -EEXIST;
+
+ if (PTR_ERR(op) != -ENOMSG)
+ return PTR_ERR(op);
+
+ if (eol)
+ /* The option has been ended. Treat it as no more
+ * header option can be written.
+ */
+ return -ENOSPC;
+
+ /* No duplication found. Store the header option. */
+ memcpy(opend, from, new_kind_len);
+
+ bpf_sock->remaining_opt_len -= new_kind_len;
+ bpf_sock->skb_data_end += new_kind_len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
+ .func = bpf_sock_ops_store_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ u32, len, u64, flags)
+{
+ if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ return -EPERM;
+
+ if (flags || len < 2)
+ return -EINVAL;
+
+ if (len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ bpf_sock->remaining_opt_len -= len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
+ .func = bpf_sock_ops_reserve_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -6179,6 +6510,9 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_lwt_seg6_adjust_srh ||
func == bpf_lwt_seg6_action ||
#endif
+#ifdef CONFIG_INET
+ func == bpf_sock_ops_store_hdr_opt ||
+#endif
func == bpf_lwt_in_push_encap ||
func == bpf_lwt_xmit_push_encap)
return true;
@@ -6550,6 +6884,12 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_INET
+ case BPF_FUNC_load_hdr_opt:
+ return &bpf_sock_ops_load_hdr_opt_proto;
+ case BPF_FUNC_store_hdr_opt:
+ return &bpf_sock_ops_store_hdr_opt_proto;
+ case BPF_FUNC_reserve_hdr_opt:
+ return &bpf_sock_ops_reserve_hdr_opt_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif /* CONFIG_INET */
@@ -7349,6 +7689,20 @@ static bool sock_ops_is_valid_access(int off, int size,
return false;
info->reg_type = PTR_TO_SOCKET_OR_NULL;
break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size,
+ size_default);
default:
if (size != size_default)
return false;
@@ -8450,17 +8804,22 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
switch (si->off) {
- case offsetof(struct bpf_sock_ops, op) ...
+ case offsetof(struct bpf_sock_ops, op):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ op),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, op));
+ break;
+
+ case offsetof(struct bpf_sock_ops, replylong[0]) ...
offsetof(struct bpf_sock_ops, replylong[3]):
- BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, op) !=
- sizeof_field(struct bpf_sock_ops_kern, op));
BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
sizeof_field(struct bpf_sock_ops_kern, reply));
BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
sizeof_field(struct bpf_sock_ops_kern, replylong));
off = si->off;
- off -= offsetof(struct bpf_sock_ops, op);
- off += offsetof(struct bpf_sock_ops_kern, op);
+ off -= offsetof(struct bpf_sock_ops, replylong[0]);
+ off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
if (type == BPF_WRITE)
*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
off);
@@ -8681,6 +9040,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sock_ops, sk):
SOCK_OPS_GET_SK();
break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb_data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb_data_end));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, data));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_len):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, len));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ off = offsetof(struct sk_buff, cb);
+ off += offsetof(struct tcp_skb_cb, tcp_flags);
+ *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
+ tcp_flags),
+ si->dst_reg, si->dst_reg, off);
+ break;
}
return insn - insn_buf;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 31f3b858db81..57a568875539 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -418,6 +418,8 @@ void tcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
@@ -2685,6 +2687,8 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_backoff = 0;
icsk->icsk_probes_out = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd = TCP_INIT_CWND;
tp->snd_cwnd_cnt = 0;
@@ -3207,7 +3211,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname,
break;
case TCP_SAVE_SYN:
- if (val < 0 || val > 1)
+ /* 0: disable, 1: enable, 2: start from ether_header */
+ if (val < 0 || val > 2)
err = -EINVAL;
else
tp->save_syn = val;
@@ -3788,20 +3793,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
lock_sock(sk);
if (tp->saved_syn) {
- if (len < tp->saved_syn[0]) {
- if (put_user(tp->saved_syn[0], optlen)) {
+ if (len < tcp_saved_syn_len(tp->saved_syn)) {
+ if (put_user(tcp_saved_syn_len(tp->saved_syn),
+ optlen)) {
release_sock(sk);
return -EFAULT;
}
release_sock(sk);
return -EINVAL;
}
- len = tp->saved_syn[0];
+ len = tcp_saved_syn_len(tp->saved_syn);
if (put_user(len, optlen)) {
release_sock(sk);
return -EFAULT;
}
- if (copy_to_user(optval, tp->saved_syn + 1, len)) {
+ if (copy_to_user(optval, tp->saved_syn->data, len)) {
release_sock(sk);
return -EFAULT;
}
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 09b62de04eea..af2814c9342a 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -295,7 +295,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
refcount_set(&req->rsk_refcnt, 2);
/* Now finish processing the fastopen child socket. */
- tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
+ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 184ea556f50e..4337841faeff 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -138,6 +138,69 @@ void clean_acked_data_flush(void)
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
#endif
+#ifdef CONFIG_CGROUP_BPF
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
+ struct bpf_sock_ops_kern sock_ops;
+
+ if (likely(!unknown_opt && !parse_all_opt))
+ return;
+
+ /* The skb will be handled in the
+ * bpf_skops_established() or
+ * bpf_skops_write_hdr_opt().
+ */
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_SYN_SENT:
+ case TCP_LISTEN:
+ return;
+ }
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = bpf_op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+#else
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+}
+#endif
+
static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
unsigned int len)
{
@@ -3801,7 +3864,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
foc->exp = exp_opt;
}
-static void smc_parse_options(const struct tcphdr *th,
+static bool smc_parse_options(const struct tcphdr *th,
struct tcp_options_received *opt_rx,
const unsigned char *ptr,
int opsize)
@@ -3810,10 +3873,13 @@ static void smc_parse_options(const struct tcphdr *th,
if (static_branch_unlikely(&tcp_have_smc)) {
if (th->syn && !(opsize & 1) &&
opsize >= TCPOLEN_EXP_SMC_BASE &&
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
opt_rx->smc_ok = 1;
+ return true;
+ }
}
#endif
+ return false;
}
/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
@@ -3874,6 +3940,7 @@ void tcp_parse_options(const struct net *net,
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
+ opt_rx->saw_unknown = 0;
while (length > 0) {
int opcode = *ptr++;
@@ -3964,15 +4031,21 @@ void tcp_parse_options(const struct net *net,
*/
if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
get_unaligned_be16(ptr) ==
- TCPOPT_FASTOPEN_MAGIC)
+ TCPOPT_FASTOPEN_MAGIC) {
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
- else
- smc_parse_options(th, opt_rx, ptr,
- opsize);
+ break;
+ }
+
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
+ break;
+
+ opt_rx->saw_unknown = 1;
break;
+ default:
+ opt_rx->saw_unknown = 1;
}
ptr += opsize-2;
length -= opsize;
@@ -5590,6 +5663,8 @@ syn_challenge:
goto discard;
}
+ bpf_skops_parse_hdr(sk, skb);
+
return true;
discard:
@@ -5798,7 +5873,7 @@ discard:
}
EXPORT_SYMBOL(tcp_rcv_established);
-void tcp_init_transfer(struct sock *sk, int bpf_op)
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -5819,7 +5894,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
tp->snd_cwnd_stamp = tcp_jiffies32;
- tcp_call_bpf(sk, bpf_op, 0, NULL);
+ bpf_skops_established(sk, bpf_op, skb);
tcp_init_congestion_control(sk);
tcp_init_buffer_space(sk);
}
@@ -5838,7 +5913,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id(sk, skb);
}
- tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
@@ -6310,7 +6385,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
} else {
tcp_try_undo_spurious_syn(sk);
tp->retrans_stamp = 0;
- tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
+ skb);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
smp_mb();
@@ -6599,13 +6675,27 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
{
if (tcp_sk(sk)->save_syn) {
u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
- u32 *copy;
+ struct saved_syn *saved_syn;
+ u32 mac_hdrlen;
+ void *base;
+
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
+ base = skb_mac_header(skb);
+ mac_hdrlen = skb_mac_header_len(skb);
+ len += mac_hdrlen;
+ } else {
+ base = skb_network_header(skb);
+ mac_hdrlen = 0;
+ }
- copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
- if (copy) {
- copy[0] = len;
- memcpy(&copy[1], skb_network_header(skb), len);
- req->saved_syn = copy;
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
+ GFP_ATOMIC);
+ if (saved_syn) {
+ saved_syn->mac_hdrlen = mac_hdrlen;
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
+ memcpy(saved_syn->data, base, len);
+ req->saved_syn = saved_syn;
}
}
}
@@ -6752,7 +6842,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
- &foc, TCP_SYNACK_FASTOPEN);
+ &foc, TCP_SYNACK_FASTOPEN, skb);
/* Add the child socket directly into the accept queue */
if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
reqsk_fastopen_remove(fastopen_sk, req, false);
@@ -6770,7 +6860,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_timeout_init((struct sock *)req));
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
- TCP_SYNACK_COOKIE);
+ TCP_SYNACK_COOKIE,
+ skb);
if (want_cookie) {
reqsk_free(req);
return 0;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5084333b5ab6..631a5ee0dd4e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -965,7 +965,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -976,7 +977,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 495dda2449fe..56c306e3cd2f 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -548,6 +548,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->fastopen_req = NULL;
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
+ bpf_skops_init_child(sk, newsk);
tcp_bpf_clone(sk, newsk);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 85ff417bda7f..ab79d36ed07f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -438,6 +438,7 @@ struct tcp_out_options {
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
u8 hash_size; /* bytes in hash_location */
+ u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
@@ -452,6 +453,145 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts)
#endif
}
+#ifdef CONFIG_CGROUP_BPF
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
+ enum tcp_synack_type synack_type)
+{
+ if (unlikely(!skb))
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
+
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
+
+ return 0;
+}
+
+/* req, syn_skb and synack_type are used when writing synack */
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
+ !*remaining)
+ return;
+
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
+
+ /* init sock_ops */
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
+
+ if (req) {
+ /* The listen "sk" cannot be passed here because
+ * it is not locked. It would not make too much
+ * sense to do bpf_setsockopt(listen_sk) based
+ * on individual connection request also.
+ *
+ * Thus, "req" is passed here and the cgroup-bpf-progs
+ * of the listen "sk" will be run.
+ *
+ * "req" is also used here for fastopen even the "sk" here is
+ * a fullsock "child" sk. It is to keep the behavior
+ * consistent between fastopen and non-fastopen on
+ * the bpf programming side.
+ */
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = *remaining;
+ /* tcp_current_mss() does not pass a skb */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err || sock_ops.remaining_opt_len == *remaining)
+ return;
+
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
+ /* round up to 4 bytes */
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
+
+ *remaining -= opts->bpf_opt_len;
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!max_opt_len))
+ return;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+
+ if (req) {
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = max_opt_len;
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err)
+ nr_written = 0;
+ else
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
+
+ if (nr_written < max_opt_len)
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
+ max_opt_len - nr_written);
+}
+#else
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+}
+#endif
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -691,6 +831,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -701,7 +843,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
struct tcp_out_options *opts,
const struct tcp_md5sig_key *md5,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
@@ -758,6 +901,9 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+ synack_type, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -826,6 +972,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
}
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+ size = MAX_TCP_OPTION_SPACE - remaining;
+ }
+
return size;
}
@@ -1213,6 +1368,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
}
#endif
+ /* BPF prog is the last one writing header option */
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
+
INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
tcp_v6_send_check, tcp_v4_send_check,
sk, skb);
@@ -3336,20 +3494,20 @@ int tcp_send_synack(struct sock *sk)
}
/**
- * tcp_make_synack - Prepare a SYN-ACK.
- * sk: listener socket
- * dst: dst entry attached to the SYNACK
- * req: request_sock pointer
- * foc: cookie for tcp fast open
- * synack_type: Type of synback to prepare
- *
- * Allocate one skb and build a SYNACK packet.
- * @dst is consumed : Caller should not use it again.
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
+ * @sk: listener socket
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
+ * should not use it again.
+ * @req: request_sock pointer
+ * @foc: cookie for tcp fast open
+ * @synack_type: Type of synack to prepare
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
*/
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -3408,8 +3566,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
+ /* bpf program will be interested in the tcp_flags */
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
- foc, synack_type) + sizeof(*th);
+ foc, synack_type,
+ syn_skb) + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -3441,6 +3602,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_unlock();
#endif
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
+ synack_type, &opts);
+
skb->skb_mstamp_ns = now;
tcp_add_tx_delay(skb, tp);
@@ -3741,6 +3905,8 @@ void tcp_send_delayed_ack(struct sock *sk)
ato = min(ato, max_ato);
}
+ ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max);
+
/* Stay within the limit we were given */
timeout = jiffies + ato;
@@ -3934,7 +4100,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
int res;
tcp_rsk(req)->txhash = net_tx_rndhash();
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ NULL);
if (!res) {
__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 305870a72352..87a633e1fbef 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -501,7 +501,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
@@ -515,7 +516,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
IPPROTO_TCP)) == NULL)
goto done;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,