// SPDX-License-Identifier: GPL-2.0
/* OpenVPN data channel offload
 *
 *  Copyright (C) 2019-2025 OpenVPN, Inc.
 *
 *  Author:	Antonio Quartulli <antonio@openvpn.net>
 */

#include <linux/skbuff.h>
#include <net/hotdata.h>
#include <net/inet_common.h>
#include <net/ipv6.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/route.h>
#include <trace/events/sock.h>

#include "ovpnpriv.h"
#include "main.h"
#include "io.h"
#include "peer.h"
#include "proto.h"
#include "skb.h"
#include "tcp.h"

#define OVPN_TCP_DEPTH_NESTING	2
#if OVPN_TCP_DEPTH_NESTING == SINGLE_DEPTH_NESTING
#error "OVPN TCP requires its own lockdep subclass"
#endif

static struct proto ovpn_tcp_prot __ro_after_init;
static struct proto_ops ovpn_tcp_ops __ro_after_init;
static struct proto ovpn_tcp6_prot __ro_after_init;
static struct proto_ops ovpn_tcp6_ops __ro_after_init;

static int ovpn_tcp_parse(struct strparser *strp, struct sk_buff *skb)
{
	struct strp_msg *rxm = strp_msg(skb);
	__be16 blen;
	u16 len;
	int err;

	/* when packets are written to the TCP stream, they are prepended
	 * with two bytes indicating the actual packet size.
	 * Parse accordingly and return the actual size (including the size
	 * header)
	 */

	if (skb->len < rxm->offset + 2)
		return 0;

	err = skb_copy_bits(skb, rxm->offset, &blen, sizeof(blen));
	if (err < 0)
		return err;

	len = be16_to_cpu(blen);
	if (len < 2)
		return -EINVAL;

	return len + 2;
}

/* queue skb for sending to userspace via recvmsg on the socket */
static void ovpn_tcp_to_userspace(struct ovpn_peer *peer, struct sock *sk,
				  struct sk_buff *skb)
{
	skb_set_owner_r(skb, sk);
	memset(skb->cb, 0, sizeof(skb->cb));
	skb_queue_tail(&peer->tcp.user_queue, skb);
	peer->tcp.sk_cb.sk_data_ready(sk);
}

static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb)
{
	struct ovpn_peer *peer = container_of(strp, struct ovpn_peer,
					      tcp.strp);
	struct strp_msg *msg = strp_msg(skb);
	size_t pkt_len = msg->full_len - 2;
	size_t off = msg->offset + 2;
	u8 opcode;

	/* ensure skb->data points to the beginning of the openvpn packet */
	if (!pskb_pull(skb, off)) {
		net_warn_ratelimited("%s: packet too small for peer %u\n",
				     netdev_name(peer->ovpn->dev), peer->id);
		goto err;
	}

	/* strparser does not trim the skb for us, therefore we do it now */
	if (pskb_trim(skb, pkt_len) != 0) {
		net_warn_ratelimited("%s: trimming skb failed for peer %u\n",
				     netdev_name(peer->ovpn->dev), peer->id);
		goto err;
	}

	/* we need the first 4 bytes of data to be accessible
	 * to extract the opcode and the key ID later on
	 */
	if (!pskb_may_pull(skb, OVPN_OPCODE_SIZE)) {
		net_warn_ratelimited("%s: packet too small to fetch opcode for peer %u\n",
				     netdev_name(peer->ovpn->dev), peer->id);
		goto err;
	}

	/* DATA_V2 packets are handled in kernel, the rest goes to user space */
	opcode = ovpn_opcode_from_skb(skb, 0);
	if (unlikely(opcode != OVPN_DATA_V2)) {
		if (opcode == OVPN_DATA_V1) {
			net_warn_ratelimited("%s: DATA_V1 detected on the TCP stream\n",
					     netdev_name(peer->ovpn->dev));
			goto err;
		}

		/* The packet size header must be there when sending the
		 * packet to userspace, therefore we put it back
		 */
		skb_push(skb, 2);
		ovpn_tcp_to_userspace(peer, strp->sk, skb);
		return;
	}

	/* hold reference to peer as required by ovpn_recv().
	 *
	 * NOTE: in this context we should already be holding a reference to
	 * this peer, therefore ovpn_peer_hold() is not expected to fail
	 */
	if (WARN_ON(!ovpn_peer_hold(peer)))
		goto err;

	ovpn_recv(peer, skb);
	return;
err:
	dev_dstats_rx_dropped(peer->ovpn->dev);
	kfree_skb(skb);
	ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
}
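/* Example of the stream framing handled above (illustrative values only):
 * a 1300-byte OpenVPN packet travels on the TCP stream as
 *
 *   +--------+------------------------+
 *   | 0x0514 |  1300 bytes of packet  |
 *   +--------+------------------------+
 *    be16 len
 *
 * ovpn_tcp_parse() reports a record length of 1302 (2-byte size header
 * plus payload), strparser then delivers the full record to
 * ovpn_tcp_rcv(), which strips the header before processing the packet
 * (or pushes it back when handing the packet to userspace).
 */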
static int ovpn_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			    int flags, int *addr_len)
{
	int err = 0, off, copied = 0, ret;
	struct ovpn_socket *sock;
	struct ovpn_peer *peer;
	struct sk_buff *skb;

	rcu_read_lock();
	sock = rcu_dereference_sk_user_data(sk);
	if (unlikely(!sock || !sock->peer || !ovpn_peer_hold(sock->peer))) {
		rcu_read_unlock();
		return -EBADF;
	}
	peer = sock->peer;
	rcu_read_unlock();

	skb = __skb_recv_datagram(sk, &peer->tcp.user_queue, flags, &off,
				  &err);
	if (!skb) {
		if (err == -EAGAIN && sk->sk_shutdown & RCV_SHUTDOWN) {
			ret = 0;
			goto out;
		}
		ret = err;
		goto out;
	}

	copied = len;
	if (copied > skb->len)
		copied = skb->len;
	else if (copied < skb->len)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (unlikely(err)) {
		kfree_skb(skb);
		ret = err;
		goto out;
	}

	if (flags & MSG_TRUNC)
		copied = skb->len;
	kfree_skb(skb);
	ret = copied;
out:
	ovpn_peer_put(peer);
	return ret;
}

void ovpn_tcp_socket_detach(struct ovpn_socket *ovpn_sock)
{
	struct ovpn_peer *peer = ovpn_sock->peer;
	struct socket *sock = ovpn_sock->sock;

	strp_stop(&peer->tcp.strp);
	skb_queue_purge(&peer->tcp.user_queue);

	/* restore CBs that were saved in ovpn_tcp_socket_attach() */
	sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready;
	sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space;
	sock->sk->sk_prot = peer->tcp.sk_cb.prot;
	sock->sk->sk_socket->ops = peer->tcp.sk_cb.ops;

	rcu_assign_sk_user_data(sock->sk, NULL);
}

void ovpn_tcp_socket_wait_finish(struct ovpn_socket *sock)
{
	struct ovpn_peer *peer = sock->peer;

	/* NOTE: we don't wait for peer->tcp.defer_del_work to finish:
	 * either the worker is not running or this function
	 * was invoked by that worker.
	 */

	cancel_work_sync(&sock->tcp_tx_work);
	strp_done(&peer->tcp.strp);

	skb_queue_purge(&peer->tcp.out_queue);
	kfree_skb(peer->tcp.out_msg.skb);
	peer->tcp.out_msg.skb = NULL;
}

static void ovpn_tcp_send_sock(struct ovpn_peer *peer, struct sock *sk)
{
	struct sk_buff *skb = peer->tcp.out_msg.skb;
	int ret, flags;

	if (!skb)
		return;

	if (peer->tcp.tx_in_progress)
		return;

	peer->tcp.tx_in_progress = true;

	do {
		flags = ovpn_skb_cb(skb)->nosignal ? MSG_NOSIGNAL : 0;
		ret = skb_send_sock_locked_with_flags(sk, skb,
						      peer->tcp.out_msg.offset,
						      peer->tcp.out_msg.len,
						      flags);
		if (unlikely(ret < 0)) {
			if (ret == -EAGAIN)
				goto out;

			net_warn_ratelimited("%s: TCP error to peer %u: %d\n",
					     netdev_name(peer->ovpn->dev),
					     peer->id, ret);

			/* in case of TCP error we can't recover the VPN
			 * stream therefore we abort the connection
			 */
			ovpn_peer_hold(peer);
			schedule_work(&peer->tcp.defer_del_work);

			/* we bail out immediately and keep tx_in_progress set
			 * to true. This way we prevent more TX attempts
			 * which would lead to more invocations of
			 * schedule_work()
			 */
			return;
		}

		peer->tcp.out_msg.len -= ret;
		peer->tcp.out_msg.offset += ret;
	} while (peer->tcp.out_msg.len > 0);

	if (!peer->tcp.out_msg.len) {
		preempt_disable();
		dev_dstats_tx_add(peer->ovpn->dev, skb->len);
		preempt_enable();
	}

	kfree_skb(peer->tcp.out_msg.skb);
	peer->tcp.out_msg.skb = NULL;
	peer->tcp.out_msg.len = 0;
	peer->tcp.out_msg.offset = 0;

out:
	peer->tcp.tx_in_progress = false;
}
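/* Deferred TX worker, scheduled by ovpn_tcp_write_space() when the socket
 * becomes writable again: retake the socket lock and let
 * ovpn_tcp_send_sock() resume transmission of the pending out_msg skb.
 */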
void ovpn_tcp_tx_work(struct work_struct *work)
{
	struct ovpn_socket *sock;

	sock = container_of(work, struct ovpn_socket, tcp_tx_work);

	lock_sock(sock->sock->sk);
	if (sock->peer)
		ovpn_tcp_send_sock(sock->peer, sock->sock->sk);
	release_sock(sock->sock->sk);
}

static void ovpn_tcp_send_sock_skb(struct ovpn_peer *peer, struct sock *sk,
				   struct sk_buff *skb)
{
	if (peer->tcp.out_msg.skb)
		ovpn_tcp_send_sock(peer, sk);

	if (peer->tcp.out_msg.skb) {
		dev_dstats_tx_dropped(peer->ovpn->dev);
		kfree_skb(skb);
		return;
	}

	peer->tcp.out_msg.skb = skb;
	peer->tcp.out_msg.len = skb->len;
	peer->tcp.out_msg.offset = 0;
	ovpn_tcp_send_sock(peer, sk);
}

void ovpn_tcp_send_skb(struct ovpn_peer *peer, struct socket *sock,
		       struct sk_buff *skb)
{
	u16 len = skb->len;

	*(__be16 *)__skb_push(skb, sizeof(u16)) = htons(len);

	spin_lock_nested(&sock->sk->sk_lock.slock, OVPN_TCP_DEPTH_NESTING);
	if (sock_owned_by_user(sock->sk)) {
		if (skb_queue_len(&peer->tcp.out_queue) >=
		    READ_ONCE(net_hotdata.max_backlog)) {
			dev_dstats_tx_dropped(peer->ovpn->dev);
			kfree_skb(skb);
			goto unlock;
		}
		__skb_queue_tail(&peer->tcp.out_queue, skb);
	} else {
		ovpn_tcp_send_sock_skb(peer, sock->sk, skb);
	}
unlock:
	spin_unlock(&sock->sk->sk_lock.slock);
}

static void ovpn_tcp_release(struct sock *sk)
{
	struct sk_buff_head queue;
	struct ovpn_socket *sock;
	struct ovpn_peer *peer;
	struct sk_buff *skb;

	rcu_read_lock();
	sock = rcu_dereference_sk_user_data(sk);
	if (!sock) {
		rcu_read_unlock();
		return;
	}

	peer = sock->peer;

	/* during initialization this function is called before
	 * assigning sock->peer
	 */
	if (unlikely(!peer || !ovpn_peer_hold(peer))) {
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	__skb_queue_head_init(&queue);
	skb_queue_splice_init(&peer->tcp.out_queue, &queue);

	while ((skb = __skb_dequeue(&queue)))
		ovpn_tcp_send_sock_skb(peer, sk, skb);

	peer->tcp.sk_cb.prot->release_cb(sk);
	ovpn_peer_put(peer);
}

static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct ovpn_socket *sock;
	int ret, linear = PAGE_SIZE;
	struct ovpn_peer *peer;
	struct sk_buff *skb;

	lock_sock(sk);
	rcu_read_lock();
	sock = rcu_dereference_sk_user_data(sk);
	if (unlikely(!sock || !sock->peer || !ovpn_peer_hold(sock->peer))) {
		rcu_read_unlock();
		release_sock(sk);
		return -EIO;
	}
	rcu_read_unlock();
	peer = sock->peer;

	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL)) {
		ret = -EOPNOTSUPP;
		goto peer_free;
	}

	if (peer->tcp.out_msg.skb) {
		ret = -EAGAIN;
		goto peer_free;
	}

	if (size < linear)
		linear = size;

	skb = sock_alloc_send_pskb(sk, linear, size - linear,
				   msg->msg_flags & MSG_DONTWAIT, &ret, 0);
	if (!skb) {
		net_err_ratelimited("%s: skb alloc failed: %d\n",
				    netdev_name(peer->ovpn->dev), ret);
		goto peer_free;
	}

	skb_put(skb, linear);
	skb->len = size;
	skb->data_len = size - linear;

	ret = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
	if (ret) {
		kfree_skb(skb);
		net_err_ratelimited("%s: skb copy from iter failed: %d\n",
				    netdev_name(peer->ovpn->dev), ret);
		goto peer_free;
	}

	ovpn_skb_cb(skb)->nosignal = msg->msg_flags & MSG_NOSIGNAL;
	ovpn_tcp_send_sock_skb(peer, sk, skb);
	ret = size;
peer_free:
	release_sock(sk);
	ovpn_peer_put(peer);
	return ret;
}
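/* Once a TCP socket is attached to ovpn, disconnecting it is not
 * supported: refuse with -EBUSY and leave the encapsulation state
 * untouched. Teardown happens through peer deletion instead (see
 * ovpn_tcp_close() below).
 */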
static int ovpn_tcp_disconnect(struct sock *sk, int flags)
{
	return -EBUSY;
}

static void ovpn_tcp_data_ready(struct sock *sk)
{
	struct ovpn_socket *sock;

	trace_sk_data_ready(sk);

	rcu_read_lock();
	sock = rcu_dereference_sk_user_data(sk);
	if (likely(sock && sock->peer))
		strp_data_ready(&sock->peer->tcp.strp);
	rcu_read_unlock();
}

static void ovpn_tcp_write_space(struct sock *sk)
{
	struct ovpn_socket *sock;

	rcu_read_lock();
	sock = rcu_dereference_sk_user_data(sk);
	if (likely(sock && sock->peer)) {
		schedule_work(&sock->tcp_tx_work);
		sock->peer->tcp.sk_cb.sk_write_space(sk);
	}
	rcu_read_unlock();
}

static void ovpn_tcp_build_protos(struct proto *new_prot,
				  struct proto_ops *new_ops,
				  const struct proto *orig_prot,
				  const struct proto_ops *orig_ops);

static void ovpn_tcp_peer_del_work(struct work_struct *work)
{
	struct ovpn_peer *peer = container_of(work, struct ovpn_peer,
					      tcp.defer_del_work);

	ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR);
	ovpn_peer_put(peer);
}

/* Set TCP encapsulation callbacks */
int ovpn_tcp_socket_attach(struct ovpn_socket *ovpn_sock,
			   struct ovpn_peer *peer)
{
	struct socket *sock = ovpn_sock->sock;
	struct strp_callbacks cb = {
		.rcv_msg = ovpn_tcp_rcv,
		.parse_msg = ovpn_tcp_parse,
	};
	int ret;

	/* make sure no pre-existing encapsulation handler exists */
	if (sock->sk->sk_user_data)
		return -EBUSY;

	/* only a fully connected socket is expected. Connection should be
	 * handled in userspace
	 */
	if (sock->sk->sk_state != TCP_ESTABLISHED) {
		net_err_ratelimited("%s: provided TCP socket is not in ESTABLISHED state: %d\n",
				    netdev_name(peer->ovpn->dev),
				    sock->sk->sk_state);
		return -EINVAL;
	}

	ret = strp_init(&peer->tcp.strp, sock->sk, &cb);
	if (ret < 0) {
		DEBUG_NET_WARN_ON_ONCE(1);
		return ret;
	}

	INIT_WORK(&peer->tcp.defer_del_work, ovpn_tcp_peer_del_work);

	__sk_dst_reset(sock->sk);
	skb_queue_head_init(&peer->tcp.user_queue);
	skb_queue_head_init(&peer->tcp.out_queue);

	/* save current CBs so that they can be restored upon socket release */
	peer->tcp.sk_cb.sk_data_ready = sock->sk->sk_data_ready;
	peer->tcp.sk_cb.sk_write_space = sock->sk->sk_write_space;
	peer->tcp.sk_cb.prot = sock->sk->sk_prot;
	peer->tcp.sk_cb.ops = sock->sk->sk_socket->ops;

	/* assign our static CBs and prot/ops */
	sock->sk->sk_data_ready = ovpn_tcp_data_ready;
	sock->sk->sk_write_space = ovpn_tcp_write_space;

	if (sock->sk->sk_family == AF_INET) {
		sock->sk->sk_prot = &ovpn_tcp_prot;
		sock->sk->sk_socket->ops = &ovpn_tcp_ops;
	} else {
		sock->sk->sk_prot = &ovpn_tcp6_prot;
		sock->sk->sk_socket->ops = &ovpn_tcp6_ops;
	}

	/* avoid using task_frag */
	sock->sk->sk_allocation = GFP_ATOMIC;
	sock->sk->sk_use_task_frag = false;

	/* enqueue the RX worker */
	strp_check_rcv(&peer->tcp.strp);

	return 0;
}

static void ovpn_tcp_close(struct sock *sk, long timeout)
{
	struct ovpn_socket *sock;
	struct ovpn_peer *peer;

	rcu_read_lock();
	sock = rcu_dereference_sk_user_data(sk);
	if (!sock || !sock->peer || !ovpn_peer_hold(sock->peer)) {
		rcu_read_unlock();
		return;
	}
	peer = sock->peer;
	rcu_read_unlock();

	ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT);
	peer->tcp.sk_cb.prot->close(sk, timeout);
	ovpn_peer_put(peer);
}

static __poll_t ovpn_tcp_poll(struct file *file, struct socket *sock,
			      poll_table *wait)
{
	__poll_t mask = datagram_poll(file, sock, wait);
	struct ovpn_socket *ovpn_sock;

	rcu_read_lock();
	ovpn_sock = rcu_dereference_sk_user_data(sock->sk);
	if (ovpn_sock && ovpn_sock->peer &&
	    !skb_queue_empty(&ovpn_sock->peer->tcp.user_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	rcu_read_unlock();

	return mask;
}
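/* Build per-family copies of the original TCP proto/proto_ops with only
 * the handlers that ovpn must intercept replaced; every other operation
 * retains the stock TCP behaviour. ovpn_tcp_socket_attach() installs
 * these copies on the peer socket.
 */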
static void ovpn_tcp_build_protos(struct proto *new_prot,
				  struct proto_ops *new_ops,
				  const struct proto *orig_prot,
				  const struct proto_ops *orig_ops)
{
	memcpy(new_prot, orig_prot, sizeof(*new_prot));
	memcpy(new_ops, orig_ops, sizeof(*new_ops));
	new_prot->recvmsg = ovpn_tcp_recvmsg;
	new_prot->sendmsg = ovpn_tcp_sendmsg;
	new_prot->disconnect = ovpn_tcp_disconnect;
	new_prot->close = ovpn_tcp_close;
	new_prot->release_cb = ovpn_tcp_release;
	new_ops->poll = ovpn_tcp_poll;
}

/* Initialize TCP static objects */
void __init ovpn_tcp_init(void)
{
	ovpn_tcp_build_protos(&ovpn_tcp_prot, &ovpn_tcp_ops, &tcp_prot,
			      &inet_stream_ops);

#if IS_ENABLED(CONFIG_IPV6)
	ovpn_tcp_build_protos(&ovpn_tcp6_prot, &ovpn_tcp6_ops, &tcpv6_prot,
			      &inet6_stream_ops);
#endif
}