summary | refs | log | tree | commit | diff
path: root/net/packet/af_packet.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r-- net/packet/af_packet.c 240
1 files changed, 149 insertions, 91 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 9419c5cf4de5..53c1d41fb1c9 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -43,13 +44,6 @@
* Chetan Loke : Implemented TPACKET_V3 block abstraction
* layer.
* Copyright (C) 2011, <lokec@ccs.neu.edu>
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/types.h>
@@ -275,24 +269,22 @@ static bool packet_use_direct_xmit(const struct packet_sock *po)
return po->xmit == packet_direct_xmit;
}
-static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev)
-{
- return dev_pick_tx_cpu_id(dev, skb, sb_dev, NULL);
-}
-
static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
const struct net_device_ops *ops = dev->netdev_ops;
+ int cpu = raw_smp_processor_id();
u16 queue_index;
+#ifdef CONFIG_XPS
+ skb->sender_cpu = cpu + 1;
+#endif
+ skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);
if (ops->ndo_select_queue) {
- queue_index = ops->ndo_select_queue(dev, skb, NULL,
- __packet_pick_tx_queue);
+ queue_index = ops->ndo_select_queue(dev, skb, NULL);
queue_index = netdev_cap_txqueue(dev, queue_index);
} else {
- queue_index = __packet_pick_tx_queue(dev, skb, NULL);
+ queue_index = netdev_pick_tx(dev, skb, NULL);
}
return queue_index;
@@ -392,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
smp_wmb();
}
-static int __packet_get_status(struct packet_sock *po, void *frame)
+static int __packet_get_status(const struct packet_sock *po, void *frame)
{
union tpacket_uhdr h;
@@ -468,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
return ts_status;
}
-static void *packet_lookup_frame(struct packet_sock *po,
- struct packet_ring_buffer *rb,
- unsigned int position,
- int status)
+static void *packet_lookup_frame(const struct packet_sock *po,
+ const struct packet_ring_buffer *rb,
+ unsigned int position,
+ int status)
{
unsigned int pg_vec_pos, frame_offset;
union tpacket_uhdr h;
@@ -766,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
struct sock *sk = &po->sk;
- if (po->stats.stats3.tp_drops)
+ if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING;
last_pkt = (struct tpacket3_hdr *)pkc1->prev;
@@ -1011,7 +1003,6 @@ static void prb_fill_curr_block(char *curr,
/* Assumes caller has the sk->rx_queue.lock */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
struct sk_buff *skb,
- int status,
unsigned int len
)
{
@@ -1083,7 +1074,7 @@ static void *packet_current_rx_frame(struct packet_sock *po,
po->rx_ring.head, status);
return curr;
case TPACKET_V3:
- return __packet_lookup_frame_in_block(po, skb, status, len);
+ return __packet_lookup_frame_in_block(po, skb, len);
default:
WARN(1, "TPACKET version not supported\n");
BUG();
@@ -1091,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po,
}
}
-static void *prb_lookup_block(struct packet_sock *po,
- struct packet_ring_buffer *rb,
- unsigned int idx,
- int status)
+static void *prb_lookup_block(const struct packet_sock *po,
+ const struct packet_ring_buffer *rb,
+ unsigned int idx,
+ int status)
{
struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
@@ -1207,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po)
#define ROOM_LOW 0x1
#define ROOM_NORMAL 0x2
-static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{
int idx, len;
- len = po->rx_ring.frame_max + 1;
- idx = po->rx_ring.head;
+ len = READ_ONCE(po->rx_ring.frame_max) + 1;
+ idx = READ_ONCE(po->rx_ring.head);
if (pow_off)
idx += len >> pow_off;
if (idx >= len)
@@ -1220,12 +1211,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
-static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{
int idx, len;
- len = po->rx_ring.prb_bdqc.knum_blocks;
- idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+ len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
+ idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
if (pow_off)
idx += len >> pow_off;
if (idx >= len)
@@ -1233,15 +1224,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}
-static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+static int __packet_rcv_has_room(const struct packet_sock *po,
+ const struct sk_buff *skb)
{
- struct sock *sk = &po->sk;
+ const struct sock *sk = &po->sk;
int ret = ROOM_NONE;
if (po->prot_hook.func != tpacket_rcv) {
- int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
- - (skb ? skb->truesize : 0);
- if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+ int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+ int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+ - (skb ? skb->truesize : 0);
+
+ if (avail > (rcvbuf >> ROOM_POW_OFF))
return ROOM_NORMAL;
else if (avail > 0)
return ROOM_LOW;
@@ -1266,19 +1260,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
- int ret;
- bool has_room;
+ int pressure, ret;
- spin_lock_bh(&po->sk.sk_receive_queue.lock);
ret = __packet_rcv_has_room(po, skb);
- has_room = ret == ROOM_NORMAL;
- if (po->pressure == has_room)
- po->pressure = !has_room;
- spin_unlock_bh(&po->sk.sk_receive_queue.lock);
+ pressure = ret != ROOM_NORMAL;
+
+ if (READ_ONCE(po->pressure) != pressure)
+ WRITE_ONCE(po->pressure, pressure);
return ret;
}
+static void packet_rcv_try_clear_pressure(struct packet_sock *po)
+{
+ if (READ_ONCE(po->pressure) &&
+ __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+ WRITE_ONCE(po->pressure, 0);
+}
+
static void packet_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_error_queue);
@@ -1296,15 +1295,21 @@ static void packet_sock_destruct(struct sock *sk)
static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
- u32 rxhash;
+ u32 *history = po->rollover->history;
+ u32 victim, rxhash;
int i, count = 0;
rxhash = skb_get_hash(skb);
for (i = 0; i < ROLLOVER_HLEN; i++)
- if (po->rollover->history[i] == rxhash)
+ if (READ_ONCE(history[i]) == rxhash)
count++;
- po->rollover->history[prandom_u32() % ROLLOVER_HLEN] = rxhash;
+ victim = prandom_u32() % ROLLOVER_HLEN;
+
+ /* Avoid dirtying the cache line if possible */
+ if (READ_ONCE(history[victim]) != rxhash)
+ WRITE_ONCE(history[victim], rxhash);
+
return count > (ROLLOVER_HLEN >> 1);
}
@@ -1359,7 +1364,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
i = j = min_t(int, po->rollover->sock, num - 1);
do {
po_next = pkt_sk(f->arr[i]);
- if (po_next != po_skip && !po_next->pressure &&
+ if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j)
po->rollover->sock = i;
@@ -1822,7 +1827,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
skb_dst_drop(skb);
/* drop conntrack reference */
- nf_reset(skb);
+ nf_reset_ct(skb);
spkt = &PACKET_SKB_CB(skb)->sa.pkt;
@@ -2122,7 +2127,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
skb_dst_drop(skb);
/* drop conntrack reference */
- nf_reset(skb);
+ nf_reset_ct(skb);
spin_lock(&sk->sk_receive_queue.lock);
po->stats.stats1.tp_packets++;
@@ -2134,10 +2139,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct:
is_drop_n_account = true;
- spin_lock(&sk->sk_receive_queue.lock);
- po->stats.stats1.tp_drops++;
+ atomic_inc(&po->tp_drops);
atomic_inc(&sk->sk_drops);
- spin_unlock(&sk->sk_receive_queue.lock);
drop_n_restore:
if (skb_head != skb->data && skb_shared(skb)) {
@@ -2201,6 +2204,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!res)
goto drop_n_restore;
+ /* If we are flooded, just give up */
+ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
+ atomic_inc(&po->tp_drops);
+ goto drop_n_restore;
+ }
+
if (skb->ip_summed == CHECKSUM_PARTIAL)
status |= TP_STATUS_CSUMNOTREADY;
else if (skb->pkt_type != PACKET_OUTGOING &&
@@ -2271,7 +2280,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
* Anyways, moving it for V1/V2 only as V3 doesn't need this
* at packet level.
*/
- if (po->stats.stats1.tp_drops)
+ if (atomic_read(&po->tp_drops))
status |= TP_STATUS_LOSING;
}
@@ -2387,9 +2396,9 @@ drop:
return 0;
drop_n_account:
- is_drop_n_account = true;
- po->stats.stats1.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock);
+ atomic_inc(&po->tp_drops);
+ is_drop_n_account = true;
sk->sk_data_ready(sk);
kfree_skb(copy_skb);
@@ -2409,6 +2418,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
ts = __packet_set_timestamp(po, ph, skb);
__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
+
+ if (!packet_read_pending(&po->tx_ring))
+ complete(&po->skb_completion);
}
sock_wfree(skb);
@@ -2593,7 +2605,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
struct net_device *dev;
struct virtio_net_hdr *vnet_hdr = NULL;
struct sockcm_cookie sockc;
@@ -2602,19 +2614,26 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
void *ph;
DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
+ unsigned char *addr = NULL;
int tp_len, size_max;
- unsigned char *addr;
void *data;
int len_sum = 0;
int status = TP_STATUS_AVAILABLE;
int hlen, tlen, copylen = 0;
+ long timeo = 0;
mutex_lock(&po->pg_vec_lock);
+ /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
+ * we need to confirm it under protection of pg_vec_lock.
+ */
+ if (unlikely(!po->tx_ring.pg_vec)) {
+ err = -EBUSY;
+ goto out;
+ }
if (likely(saddr == NULL)) {
dev = packet_cached_dev_get(po);
proto = po->num;
- addr = NULL;
} else {
err = -EINVAL;
if (msg->msg_namelen < sizeof(struct sockaddr_ll))
@@ -2624,10 +2643,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
sll_addr)))
goto out;
proto = saddr->sll_protocol;
- addr = saddr->sll_halen ? saddr->sll_addr : NULL;
dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
- if (addr && dev && saddr->sll_halen < dev->addr_len)
- goto out_put;
+ if (po->sk.sk_socket->type == SOCK_DGRAM) {
+ if (dev && msg->msg_namelen < dev->addr_len +
+ offsetof(struct sockaddr_ll, sll_addr))
+ goto out_put;
+ addr = saddr->sll_addr;
+ }
}
err = -ENXIO;
@@ -2652,12 +2674,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
size_max = dev->mtu + reserve + VLAN_HLEN;
+ reinit_completion(&po->skb_completion);
+
do {
ph = packet_current_frame(po, &po->tx_ring,
TP_STATUS_SEND_REQUEST);
if (unlikely(ph == NULL)) {
- if (need_wait && need_resched())
- schedule();
+ if (need_wait && skb) {
+ timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
+ timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
+ if (timeo <= 0) {
+ err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
+ goto out_put;
+ }
+ }
+ /* check for additional frames */
continue;
}
@@ -2799,7 +2830,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
struct sk_buff *skb;
struct net_device *dev;
__be16 proto;
- unsigned char *addr;
+ unsigned char *addr = NULL;
int err, reserve = 0;
struct sockcm_cookie sockc;
struct virtio_net_hdr vnet_hdr = { 0 };
@@ -2816,7 +2847,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (likely(saddr == NULL)) {
dev = packet_cached_dev_get(po);
proto = po->num;
- addr = NULL;
} else {
err = -EINVAL;
if (msg->msg_namelen < sizeof(struct sockaddr_ll))
@@ -2824,10 +2854,13 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
goto out;
proto = saddr->sll_protocol;
- addr = saddr->sll_halen ? saddr->sll_addr : NULL;
dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
- if (addr && dev && saddr->sll_halen < dev->addr_len)
- goto out_unlock;
+ if (sock->type == SOCK_DGRAM) {
+ if (dev && msg->msg_namelen < dev->addr_len +
+ offsetof(struct sockaddr_ll, sll_addr))
+ goto out_unlock;
+ addr = saddr->sll_addr;
+ }
}
err = -ENXIO;
@@ -3012,8 +3045,8 @@ static int packet_release(struct socket *sock)
synchronize_net();
+ kfree(po->rollover);
if (f) {
- kfree(po->rollover);
fanout_release_data(f);
kfree(f);
}
@@ -3211,6 +3244,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
sock_init_data(sock, sk);
po = pkt_sk(sk);
+ init_completion(&po->skb_completion);
sk->sk_family = PF_PACKET;
po->num = proto;
po->xmit = dev_queue_xmit;
@@ -3308,8 +3342,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (skb == NULL)
goto out;
- if (pkt_sk(sk)->pressure)
- packet_rcv_has_room(pkt_sk(sk), NULL);
+ packet_rcv_try_clear_pressure(pkt_sk(sk));
if (pkt_sk(sk)->has_vnet_hdr) {
err = packet_rcv_vnet(msg, skb, &len);
@@ -3344,20 +3377,29 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
sock_recv_ts_and_drops(msg, sk, skb);
if (msg->msg_name) {
+ int copy_len;
+
/* If the address length field is there to be filled
* in, we fill it in now.
*/
if (sock->type == SOCK_PACKET) {
__sockaddr_check_size(sizeof(struct sockaddr_pkt));
msg->msg_namelen = sizeof(struct sockaddr_pkt);
+ copy_len = msg->msg_namelen;
} else {
struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
msg->msg_namelen = sll->sll_halen +
offsetof(struct sockaddr_ll, sll_addr);
+ copy_len = msg->msg_namelen;
+ if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
+ memset(msg->msg_name +
+ offsetof(struct sockaddr_ll, sll_addr),
+ 0, sizeof(sll->sll_addr));
+ msg->msg_namelen = sizeof(struct sockaddr_ll);
+ }
}
- memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
- msg->msg_namelen);
+ memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
}
if (pkt_sk(sk)->auxdata) {
@@ -3872,6 +3914,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
void *data = &val;
union tpacket_stats_u st;
struct tpacket_rollover_stats rstats;
+ int drops;
if (level != SOL_PACKET)
return -ENOPROTOOPT;
@@ -3888,14 +3931,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
memcpy(&st, &po->stats, sizeof(st));
memset(&po->stats, 0, sizeof(po->stats));
spin_unlock_bh(&sk->sk_receive_queue.lock);
+ drops = atomic_xchg(&po->tp_drops, 0);
if (po->tp_version == TPACKET_V3) {
lv = sizeof(struct tpacket_stats_v3);
- st.stats3.tp_packets += st.stats3.tp_drops;
+ st.stats3.tp_drops = drops;
+ st.stats3.tp_packets += drops;
data = &st.stats3;
} else {
lv = sizeof(struct tpacket_stats);
- st.stats1.tp_packets += st.stats1.tp_drops;
+ st.stats1.tp_drops = drops;
+ st.stats1.tp_packets += drops;
data = &st.stats1;
}
@@ -4077,11 +4123,6 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
}
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
- case SIOCGSTAMPNS:
- return sock_get_timestampns(sk, (struct timespec __user *)arg);
-
#ifdef CONFIG_INET
case SIOCADDRT:
case SIOCDELRT:
@@ -4119,8 +4160,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock,
TP_STATUS_KERNEL))
mask |= EPOLLIN | EPOLLRDNORM;
}
- if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
- po->pressure = 0;
+ packet_rcv_try_clear_pressure(po);
spin_unlock_bh(&sk->sk_receive_queue.lock);
spin_lock_bh(&sk->sk_write_queue.lock);
if (po->tx_ring.pg_vec) {
@@ -4314,7 +4354,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
req3->tp_sizeof_priv ||
req3->tp_feature_req_word) {
err = -EINVAL;
- goto out;
+ goto out_free_pg_vec;
}
}
break;
@@ -4378,6 +4418,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
prb_shutdown_retire_blk_timer(po, rb_queue);
}
+out_free_pg_vec:
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
@@ -4457,6 +4498,7 @@ static const struct proto_ops packet_ops_spkt = {
.getname = packet_getname_spkt,
.poll = datagram_poll,
.ioctl = packet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_no_setsockopt,
@@ -4478,6 +4520,7 @@ static const struct proto_ops packet_ops = {
.getname = packet_getname,
.poll = packet_poll,
.ioctl = packet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = packet_setsockopt,
@@ -4590,14 +4633,29 @@ static void __exit packet_exit(void)
static int __init packet_init(void)
{
- int rc = proto_register(&packet_proto, 0);
+ int rc;
- if (rc != 0)
+ rc = proto_register(&packet_proto, 0);
+ if (rc)
goto out;
+ rc = sock_register(&packet_family_ops);
+ if (rc)
+ goto out_proto;
+ rc = register_pernet_subsys(&packet_net_ops);
+ if (rc)
+ goto out_sock;
+ rc = register_netdevice_notifier(&packet_netdev_notifier);
+ if (rc)
+ goto out_pernet;
+
+ return 0;
- sock_register(&packet_family_ops);
- register_pernet_subsys(&packet_net_ops);
- register_netdevice_notifier(&packet_netdev_notifier);
+out_pernet:
+ unregister_pernet_subsys(&packet_net_ops);
+out_sock:
+ sock_unregister(PF_PACKET);
+out_proto:
+ proto_unregister(&packet_proto);
out:
return rc;
}