diff options
| -rw-r--r-- | include/linux/filter.h | 1 | ||||
| -rw-r--r-- | include/linux/skbuff.h | 12 | ||||
| -rw-r--r-- | include/net/sock.h | 10 | ||||
| -rw-r--r-- | include/net/tcp.h | 7 | ||||
| -rw-r--r-- | include/uapi/linux/bpf.h | 30 | ||||
| -rw-r--r-- | kernel/bpf/btf.c | 1 | ||||
| -rw-r--r-- | net/core/dev.c | 3 | ||||
| -rw-r--r-- | net/core/filter.c | 79 | ||||
| -rw-r--r-- | net/core/skbuff.c | 53 | ||||
| -rw-r--r-- | net/core/sock.c | 14 | ||||
| -rw-r--r-- | net/dsa/user.c | 2 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 6 | ||||
| -rw-r--r-- | net/ipv4/tcp_input.c | 2 | ||||
| -rw-r--r-- | net/ipv4/tcp_output.c | 2 | ||||
| -rw-r--r-- | net/socket.c | 2 | ||||
| -rw-r--r-- | tools/include/uapi/linux/bpf.h | 30 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/prog_tests/net_timestamping.c | 239 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/progs/net_timestamping.c | 248 |
18 files changed, 727 insertions, 14 deletions
diff --git a/include/linux/filter.h b/include/linux/filter.h index a3ea46281595..d36d5d5180b1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1508,6 +1508,7 @@ struct bpf_sock_ops_kern { void *skb_data_end; u8 op; u8 is_fullsock; + u8 is_locked_tcp_sock; u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bb2b751d274a..0b4f1889500d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -470,7 +470,7 @@ struct skb_shared_hwtstamps { /* Definitions for tx_flags in struct skb_shared_info */ enum { /* generate hardware time stamp */ - SKBTX_HW_TSTAMP = 1 << 0, + SKBTX_HW_TSTAMP_NOBPF = 1 << 0, /* generate software time stamp when queueing packet to NIC */ SKBTX_SW_TSTAMP = 1 << 1, @@ -489,10 +489,16 @@ enum { /* generate software time stamp when entering packet scheduling */ SKBTX_SCHED_TSTAMP = 1 << 6, + + /* used for bpf extension when a bpf program is loaded */ + SKBTX_BPF = 1 << 7, }; +#define SKBTX_HW_TSTAMP (SKBTX_HW_TSTAMP_NOBPF | SKBTX_BPF) + #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ - SKBTX_SCHED_TSTAMP) + SKBTX_SCHED_TSTAMP | \ + SKBTX_BPF) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \ SKBTX_HW_TSTAMP_USE_CYCLES | \ SKBTX_ANY_SW_TSTAMP) @@ -4564,7 +4570,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, static inline void skb_tx_timestamp(struct sk_buff *skb) { skb_clone_tx_timestamp(skb); - if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) + if (skb_shinfo(skb)->tx_flags & (SKBTX_SW_TSTAMP | SKBTX_BPF)) skb_tstamp_tx(skb, NULL); } diff --git a/include/net/sock.h b/include/net/sock.h index 60ebf3c7b229..2f6b55c59c16 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -303,6 +303,7 @@ struct sk_filter; * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_bpf_cb_flags: used in bpf_setsockopt() * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. @@ -525,6 +526,8 @@ struct sock { u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG)) + u8 sk_bpf_cb_flags; void *sk_user_data; #ifdef CONFIG_SECURITY @@ -2921,6 +2924,13 @@ int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); void sock_enable_timestamps(struct sock *sk); +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); +#else +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ +} +#endif void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); diff --git a/include/net/tcp.h b/include/net/tcp.h index 7fd2d7fa4532..ae6c95b01012 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -964,10 +964,12 @@ struct tcp_skb_cb { __u8 sacked; /* State flags for SACK. */ __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ - __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ +#define TSTAMP_ACK_SK 0x1 +#define TSTAMP_ACK_BPF 0x2 + __u8 txstamp_ack:2, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ - unused:5; + unused:4; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { @@ -2657,6 +2659,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_owned_by_me(sk); } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2acf9b336371..defa5bb881f4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6913,6 +6913,12 @@ enum { BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; +enum { + SK_BPF_CB_TX_TIMESTAMPING = 1<<0, + SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) | + SK_BPF_CB_TX_TIMESTAMPING +}; + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -7025,6 +7031,29 @@ enum { * by the kernel or the * earlier bpf-progs. */ + BPF_SOCK_OPS_TSTAMP_SCHED_CB, /* Called when skb is passing + * through dev layer when + * SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ + BPF_SOCK_OPS_TSTAMP_SND_SW_CB, /* Called when skb is about to send + * to the nic when SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ + BPF_SOCK_OPS_TSTAMP_SND_HW_CB, /* Called in hardware phase when + * SK_BPF_CB_TX_TIMESTAMPING feature + * is on. + */ + BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the + * same sendmsg call are acked + * when SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ + BPF_SOCK_OPS_TSTAMP_SENDMSG_CB, /* Called when every sendmsg syscall + * is triggered. It's used to correlate + * sendmsg timestamp with corresponding + * tskey. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -7091,6 +7120,7 @@ enum { TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ + SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ }; enum { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9de6acddd479..551eedb87d58 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8524,6 +8524,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; diff --git a/net/core/dev.c b/net/core/dev.c index d5ab9a4b318e..436f2bdfb2d5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4501,7 +4501,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); skb_assert_len(skb); - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + if (unlikely(skb_shinfo(skb)->tx_flags & + (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also diff --git a/net/core/filter.c b/net/core/filter.c index ffec7b4357f9..a0867c5b32b3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5222,6 +5222,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) +{ + u32 sk_bpf_cb_flags; + + if (getopt) { + *(u32 *)optval = sk->sk_bpf_cb_flags; + return 0; + } + + sk_bpf_cb_flags = *(u32 *)optval; + + if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) + return -EINVAL; + + sk->sk_bpf_cb_flags = sk_bpf_cb_flags; + + return 0; +} + static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) @@ -5238,6 +5257,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname, case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: + case SK_BPF_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5247,6 +5267,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname, return -EINVAL; } + if (optname == SK_BPF_CB_FLAGS) + return sk_bpf_set_get_cb_flags(sk, optval, getopt); + if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; @@ -5501,6 +5524,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; } +static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) +{ + return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; +} + static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { @@ -5651,6 +5679,9 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -5736,6 +5767,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; @@ -5778,6 +5812,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; @@ -7587,6 +7624,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u8 search_kind, search_len, copy_len, magic_len; int ret; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + /* 2 byte is the minimal option len except TCPOPT_NOP and * TCPOPT_EOL which are useless for the bpf prog to learn * and this helper disallow loading them also. @@ -10359,10 +10399,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ @@ -10447,10 +10487,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ @@ -12063,6 +12103,25 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, #endif } +__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, + u64 flags) +{ + struct sk_buff *skb; + + if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) + return -EOPNOTSUPP; + + if (flags) + return -EINVAL; + + skb = skops->skb; + skb_shinfo(skb)->tx_flags |= SKBTX_BPF; + TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; + skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + + return 0; +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, @@ -12096,6 +12155,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) + static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, @@ -12116,6 +12179,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .set = &bpf_kfunc_check_set_tcp_reqsk, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_sock_ops, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -12134,7 +12202,8 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); } late_initcall(bpf_kfunc_init); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a441613a1e6c..dd33c12f00ca 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5539,6 +5539,52 @@ err: } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); +static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + int tstype) +{ + switch (tstype) { + case SCM_TSTAMP_SCHED: + return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; + case SCM_TSTAMP_SND: + return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF : + SKBTX_SW_TSTAMP); + case SCM_TSTAMP_ACK: + return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; + } + + return false; +} + +static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, + int tstype) +{ + int op; + + switch (tstype) { + case SCM_TSTAMP_SCHED: + op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; + break; + case SCM_TSTAMP_SND: + if (hwtstamps) { + op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; + *skb_hwtstamps(skb) = *hwtstamps; + } else { + op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; + } + break; + case SCM_TSTAMP_ACK: + op = BPF_SOCK_OPS_TSTAMP_ACK_CB; + break; + default: + return; + } + + bpf_skops_tx_timestamping(sk, skb, op); +} + void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, @@ -5551,6 +5597,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; + if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) + skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, + sk, tstype); + + if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) + return; + tsflags = READ_ONCE(sk->sk_tsflags); if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) diff --git a/net/core/sock.c b/net/core/sock.c index a197f0a0b878..ba653c6a1229 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -949,6 +949,20 @@ int sock_set_timestamping(struct sock *sk, int optname, return 0; } +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ + struct bpf_sock_ops_kern sock_ops; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, 0); + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); +} +#endif + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); diff --git a/net/dsa/user.c b/net/dsa/user.c index 2296a4ead020..804dc7dac4f2 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -897,7 +897,7 @@ static void dsa_skb_tx_timestamp(struct dsa_user_priv *p, { struct dsa_switch *ds = p->dp->ds; - if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF)) return; if (!ds->ops->port_txtstamp) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 992d5c9b2487..298d1da05bee 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -492,10 +492,14 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) sock_tx_timestamp(sk, sockc, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) - tcb->txstamp_ack = 1; + tcb->txstamp_ack |= TSTAMP_ACK_SK; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } + + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && + SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb) + bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB); } static bool tcp_stream_is_readable(struct sock *sk, int target) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4686783b70de..4a9e70e23ef8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -169,6 +169,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); @@ -185,6 +186,7 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ if (skb) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 464232a0d637..a796cf451e55 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -525,6 +525,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } @@ -570,6 +571,7 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } diff --git a/net/socket.c b/net/socket.c index 28bae5a94234..0545e9ea7058 100644 --- a/net/socket.c +++ b/net/socket.c @@ -681,7 +681,7 @@ void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags) u8 flags = *tx_flags; if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) { - flags |= SKBTX_HW_TSTAMP; + flags |= SKBTX_HW_TSTAMP_NOBPF; /* PTP hardware clocks can provide a free running cycle counter * as a time base for virtual clocks. Tell driver to use the diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2acf9b336371..defa5bb881f4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6913,6 +6913,12 @@ enum { BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; +enum { + SK_BPF_CB_TX_TIMESTAMPING = 1<<0, + SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) | + SK_BPF_CB_TX_TIMESTAMPING +}; + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -7025,6 +7031,29 @@ enum { * by the kernel or the * earlier bpf-progs. */ + BPF_SOCK_OPS_TSTAMP_SCHED_CB, /* Called when skb is passing + * through dev layer when + * SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ + BPF_SOCK_OPS_TSTAMP_SND_SW_CB, /* Called when skb is about to send + * to the nic when SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ + BPF_SOCK_OPS_TSTAMP_SND_HW_CB, /* Called in hardware phase when + * SK_BPF_CB_TX_TIMESTAMPING feature + * is on. + */ + BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the + * same sendmsg call are acked + * when SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ + BPF_SOCK_OPS_TSTAMP_SENDMSG_CB, /* Called when every sendmsg syscall + * is triggered. It's used to correlate + * sendmsg timestamp with corresponding + * tskey. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -7091,6 +7120,7 @@ enum { TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ + SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ }; enum { diff --git a/tools/testing/selftests/bpf/prog_tests/net_timestamping.c b/tools/testing/selftests/bpf/prog_tests/net_timestamping.c new file mode 100644 index 000000000000..dbfd87499b6b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/net_timestamping.c @@ -0,0 +1,239 @@ +#include <linux/net_tstamp.h> +#include <sys/time.h> +#include <linux/errqueue.h> +#include "test_progs.h" +#include "network_helpers.h" +#include "net_timestamping.skel.h" + +#define CG_NAME "/net-timestamping-test" +#define NSEC_PER_SEC 1000000000LL + +static const char addr4_str[] = "127.0.0.1"; +static const char addr6_str[] = "::1"; +static struct net_timestamping *skel; +static const int cfg_payload_len = 30; +static struct timespec usr_ts; +static u64 delay_tolerance_nsec = 10000000000; /* 10 seconds */ +int SK_TS_SCHED; +int SK_TS_TXSW; +int SK_TS_ACK; + +static int64_t timespec_to_ns64(struct timespec *ts) +{ + return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec; +} + +static void validate_key(int tskey, int tstype) +{ + static int expected_tskey = -1; + + if (tstype == SCM_TSTAMP_SCHED) + expected_tskey = cfg_payload_len - 1; + + ASSERT_EQ(expected_tskey, tskey, "tskey mismatch"); + + expected_tskey = tskey; +} + +static void validate_timestamp(struct timespec *cur, struct timespec *prev) +{ + int64_t cur_ns, prev_ns; + + cur_ns = timespec_to_ns64(cur); + prev_ns = timespec_to_ns64(prev); + + ASSERT_LT(cur_ns - prev_ns, delay_tolerance_nsec, "latency"); +} + +static void test_socket_timestamp(struct scm_timestamping *tss, int tstype, + int tskey) +{ + static struct timespec prev_ts; + + validate_key(tskey, tstype); + + switch (tstype) { + case SCM_TSTAMP_SCHED: + validate_timestamp(&tss->ts[0], &usr_ts); + SK_TS_SCHED += 1; + break; + case SCM_TSTAMP_SND: + validate_timestamp(&tss->ts[0], &prev_ts); + SK_TS_TXSW += 1; + break; + case SCM_TSTAMP_ACK: + validate_timestamp(&tss->ts[0], &prev_ts); + SK_TS_ACK += 1; + break; + } + + prev_ts = tss->ts[0]; +} + +static void test_recv_errmsg_cmsg(struct msghdr *msg) +{ + struct sock_extended_err *serr = NULL; + struct scm_timestamping *tss = NULL; + struct cmsghdr *cm; + + for (cm = CMSG_FIRSTHDR(msg); + cm && cm->cmsg_len; + cm = CMSG_NXTHDR(msg, cm)) { + if (cm->cmsg_level == SOL_SOCKET && + cm->cmsg_type == SCM_TIMESTAMPING) { + tss = (void *)CMSG_DATA(cm); + } else if ((cm->cmsg_level == SOL_IP && + cm->cmsg_type == IP_RECVERR) || + (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type == IPV6_RECVERR) || + (cm->cmsg_level == SOL_PACKET && + cm->cmsg_type == PACKET_TX_TIMESTAMP)) { + serr = (void *)CMSG_DATA(cm); + ASSERT_EQ(serr->ee_origin, SO_EE_ORIGIN_TIMESTAMPING, + "cmsg type"); + } + + if (serr && tss) + test_socket_timestamp(tss, serr->ee_info, + serr->ee_data); + } +} + +static bool socket_recv_errmsg(int fd) +{ + static char ctrl[1024 /* overprovision*/]; + char data[cfg_payload_len]; + static struct msghdr msg; + struct iovec entry; + int n = 0; + + memset(&msg, 0, sizeof(msg)); + memset(&entry, 0, sizeof(entry)); + memset(ctrl, 0, sizeof(ctrl)); + + entry.iov_base = data; + entry.iov_len = cfg_payload_len; + msg.msg_iov = &entry; + msg.msg_iovlen = 1; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = ctrl; + msg.msg_controllen = sizeof(ctrl); + + n = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (n == -1) + ASSERT_EQ(errno, EAGAIN, "recvmsg MSG_ERRQUEUE"); + + if (n >= 0) + test_recv_errmsg_cmsg(&msg); + + return n == -1; +} + +static void test_socket_timestamping(int fd) +{ + while (!socket_recv_errmsg(fd)); + + ASSERT_EQ(SK_TS_SCHED, 1, "SCM_TSTAMP_SCHED"); + ASSERT_EQ(SK_TS_TXSW, 1, "SCM_TSTAMP_SND"); + ASSERT_EQ(SK_TS_ACK, 1, "SCM_TSTAMP_ACK"); + + SK_TS_SCHED = 0; + SK_TS_TXSW = 0; + SK_TS_ACK = 0; +} + +static void test_tcp(int family, bool enable_socket_timestamping) +{ + struct net_timestamping__bss *bss; + char buf[cfg_payload_len]; + int sfd = -1, cfd = -1; + unsigned int sock_opt; + struct netns_obj *ns; + int cg_fd; + int ret; + + cg_fd = test__join_cgroup(CG_NAME); + if (!ASSERT_OK_FD(cg_fd, "join cgroup")) + return; + + ns = netns_new("net_timestamping_ns", true); + if (!ASSERT_OK_PTR(ns, "create ns")) + goto out; + + skel = net_timestamping__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skel")) + goto out; + + if (!ASSERT_OK(net_timestamping__attach(skel), "attach skel")) + goto out; + + skel->links.skops_sockopt = + bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd); + if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup")) + goto out; + + bss = skel->bss; + memset(bss, 0, sizeof(*bss)); + + skel->bss->monitored_pid = getpid(); + + sfd = start_server(family, SOCK_STREAM, + family == AF_INET6 ? addr6_str : addr4_str, 0, 0); + if (!ASSERT_OK_FD(sfd, "start_server")) + goto out; + + cfd = connect_to_fd(sfd, 0); + if (!ASSERT_OK_FD(cfd, "connect_to_fd_server")) + goto out; + + if (enable_socket_timestamping) { + sock_opt = SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID | + SOF_TIMESTAMPING_TX_SCHED | + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK; + ret = setsockopt(cfd, SOL_SOCKET, SO_TIMESTAMPING, + (char *) &sock_opt, sizeof(sock_opt)); + if (!ASSERT_OK(ret, "setsockopt SO_TIMESTAMPING")) + goto out; + + ret = clock_gettime(CLOCK_REALTIME, &usr_ts); + if (!ASSERT_OK(ret, "get user time")) + goto out; + } + + ret = write(cfd, buf, sizeof(buf)); + if (!ASSERT_EQ(ret, sizeof(buf), "send to server")) + goto out; + + if (enable_socket_timestamping) + test_socket_timestamping(cfd); + + ASSERT_EQ(bss->nr_active, 1, "nr_active"); + ASSERT_EQ(bss->nr_snd, 2, "nr_snd"); + ASSERT_EQ(bss->nr_sched, 1, "nr_sched"); + ASSERT_EQ(bss->nr_txsw, 1, "nr_txsw"); + ASSERT_EQ(bss->nr_ack, 1, "nr_ack"); + +out: + if (sfd >= 0) + close(sfd); + if (cfd >= 0) + close(cfd); + net_timestamping__destroy(skel); + netns_free(ns); + close(cg_fd); +} + +void test_net_timestamping(void) +{ + if (test__start_subtest("INET4: bpf timestamping")) + test_tcp(AF_INET, false); + if (test__start_subtest("INET4: bpf and socket timestamping")) + test_tcp(AF_INET, true); + if (test__start_subtest("INET6: bpf timestamping")) + test_tcp(AF_INET6, false); + if (test__start_subtest("INET6: bpf and socket timestamping")) + test_tcp(AF_INET6, true); +} diff --git a/tools/testing/selftests/bpf/progs/net_timestamping.c b/tools/testing/selftests/bpf/progs/net_timestamping.c new file mode 100644 index 000000000000..b4c2f0f2be11 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/net_timestamping.c @@ -0,0 +1,248 @@ +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include <errno.h> + +__u32 monitored_pid = 0; + +int nr_active; +int nr_snd; +int nr_passive; +int nr_sched; +int nr_txsw; +int nr_ack; + +struct sk_stg { + __u64 sendmsg_ns; /* record ts when sendmsg is called */ +}; + +struct sk_tskey { + u64 cookie; + u32 tskey; +}; + +struct delay_info { + u64 sendmsg_ns; /* record ts when sendmsg is called */ + u32 sched_delay; /* SCHED_CB - sendmsg_ns */ + u32 snd_sw_delay; /* SND_SW_CB - SCHED_CB */ + u32 ack_delay; /* ACK_CB - SND_SW_CB */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct sk_stg); +} sk_stg_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct sk_tskey); + __type(value, struct delay_info); + __uint(max_entries, 1024); +} time_map SEC(".maps"); + +static u64 delay_tolerance_nsec = 10000000000; /* 10 second as an example */ + +extern int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, u64 flags) __ksym; + +static int bpf_test_sockopt(void *ctx, const struct sock *sk, int expected) +{ + int tmp, new = SK_BPF_CB_TX_TIMESTAMPING; + int opt = SK_BPF_CB_FLAGS; + int level = SOL_SOCKET; + + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)) != expected) + return 1; + + if (bpf_getsockopt(ctx, level, opt, &tmp, sizeof(tmp)) != expected || + (!expected && tmp != new)) + return 1; + + return 0; +} + +static bool bpf_test_access_sockopt(void *ctx, const struct sock *sk) +{ + if (bpf_test_sockopt(ctx, sk, -EOPNOTSUPP)) + return true; + return false; +} + +static bool bpf_test_access_load_hdr_opt(struct bpf_sock_ops *skops) +{ + u8 opt[3] = {0}; + int load_flags = 0; + int ret; + + ret = bpf_load_hdr_opt(skops, opt, sizeof(opt), load_flags); + if (ret != -EOPNOTSUPP) + return true; + + return false; +} + +static bool bpf_test_access_cb_flags_set(struct bpf_sock_ops *skops) +{ + int ret; + + ret = bpf_sock_ops_cb_flags_set(skops, 0); + if (ret != -EOPNOTSUPP) + return true; + + return false; +} + +/* In the timestamping callbacks, we're not allowed to call the following + * BPF CALLs for the safety concern. Return false if expected. + */ +static bool bpf_test_access_bpf_calls(struct bpf_sock_ops *skops, + const struct sock *sk) +{ + if (bpf_test_access_sockopt(skops, sk)) + return true; + + if (bpf_test_access_load_hdr_opt(skops)) + return true; + + if (bpf_test_access_cb_flags_set(skops)) + return true; + + return false; +} + +static bool bpf_test_delay(struct bpf_sock_ops *skops, const struct sock *sk) +{ + struct bpf_sock_ops_kern *skops_kern; + u64 timestamp = bpf_ktime_get_ns(); + struct skb_shared_info *shinfo; + struct delay_info dinfo = {0}; + struct sk_tskey key = {0}; + struct delay_info *val; + struct sk_buff *skb; + struct sk_stg *stg; + u64 prior_ts, delay; + + if (bpf_test_access_bpf_calls(skops, sk)) + return false; + + skops_kern = bpf_cast_to_kern_ctx(skops); + skb = skops_kern->skb; + shinfo = bpf_core_cast(skb->head + skb->end, struct skb_shared_info); + + key.cookie = bpf_get_socket_cookie(skops); + if (!key.cookie) + return false; + + if (skops->op == BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) { + stg = bpf_sk_storage_get(&sk_stg_map, (void *)sk, 0, 0); + if (!stg) + return false; + dinfo.sendmsg_ns = stg->sendmsg_ns; + bpf_sock_ops_enable_tx_tstamp(skops_kern, 0); + key.tskey = shinfo->tskey; + if (!key.tskey) + return false; + bpf_map_update_elem(&time_map, &key, &dinfo, BPF_ANY); + return true; + } + + key.tskey = shinfo->tskey; + if (!key.tskey) + return false; + + val = bpf_map_lookup_elem(&time_map, &key); + if (!val) + return false; + + switch (skops->op) { + case BPF_SOCK_OPS_TSTAMP_SCHED_CB: + val->sched_delay = timestamp - val->sendmsg_ns; + delay = val->sched_delay; + break; + case BPF_SOCK_OPS_TSTAMP_SND_SW_CB: + prior_ts = val->sched_delay + val->sendmsg_ns; + val->snd_sw_delay = timestamp - prior_ts; + delay = val->snd_sw_delay; + break; + case BPF_SOCK_OPS_TSTAMP_ACK_CB: + prior_ts = val->snd_sw_delay + val->sched_delay + val->sendmsg_ns; + val->ack_delay = timestamp - prior_ts; + delay = val->ack_delay; + break; + } + + if (delay >= delay_tolerance_nsec) + return false; + + /* Since it's the last one, remove from the map after latency check */ + if (skops->op == BPF_SOCK_OPS_TSTAMP_ACK_CB) + bpf_map_delete_elem(&time_map, &key); + + return true; +} + +SEC("fentry/tcp_sendmsg_locked") +int BPF_PROG(trace_tcp_sendmsg_locked, struct sock *sk, struct msghdr *msg, + size_t size) +{ + __u32 pid = bpf_get_current_pid_tgid() >> 32; + u64 timestamp = bpf_ktime_get_ns(); + u32 flag = sk->sk_bpf_cb_flags; + struct sk_stg *stg; + + if (pid != monitored_pid || !flag) + return 0; + + stg = bpf_sk_storage_get(&sk_stg_map, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!stg) + return 0; + + stg->sendmsg_ns = timestamp; + nr_snd += 1; + return 0; +} + +SEC("sockops") +int skops_sockopt(struct bpf_sock_ops *skops) +{ + struct bpf_sock *bpf_sk = skops->sk; + const struct sock *sk; + + if (!bpf_sk) + return 1; + + sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk); + if (!sk) + return 1; + + switch (skops->op) { + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + nr_active += !bpf_test_sockopt(skops, sk, 0); + break; + case BPF_SOCK_OPS_TSTAMP_SENDMSG_CB: + if (bpf_test_delay(skops, sk)) + nr_snd += 1; + break; + case BPF_SOCK_OPS_TSTAMP_SCHED_CB: + if (bpf_test_delay(skops, sk)) + nr_sched += 1; + break; + case BPF_SOCK_OPS_TSTAMP_SND_SW_CB: + if (bpf_test_delay(skops, sk)) + nr_txsw += 1; + break; + case BPF_SOCK_OPS_TSTAMP_ACK_CB: + if (bpf_test_delay(skops, sk)) + nr_ack += 1; + break; + } + + return 1; +} + +char _license[] SEC("license") = "GPL"; |
