diff options
Diffstat (limited to 'net/packet/af_packet.c')
| -rw-r--r-- | net/packet/af_packet.c | 222 |
1 files changed, 89 insertions, 133 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c131e5ceea37..494d628d10a5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *, static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); -static void prb_retire_rx_blk_timer_expired(struct timer_list *); -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) return proto; } -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) -{ - del_timer_sync(&pkc->retire_blk_timer); -} - static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); - - spin_lock_bh(&rb_queue->lock); - pkc->delete_blk_timer = 1; - spin_unlock_bh(&rb_queue->lock); - - prb_del_retire_blk_timer(pkc); -} - -static void prb_setup_retire_blk_timer(struct packet_sock *po) -{ - struct tpacket_kbdq_core *pkc; - - pkc = GET_PBDQC_FROM_RB(&po->rx_ring); - timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, - 0); - pkc->retire_blk_timer.expires = jiffies; + hrtimer_cancel(&pkc->retire_blk_timer); } static int prb_calc_retire_blk_tmo(struct packet_sock *po, @@ -669,60 +648,39 @@ static void init_prb_bdqc(struct packet_sock *po, p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; - p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) - p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; + p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov); else - p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, - req_u->req3.tp_block_size); - p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); + p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po, + req_u->req3.tp_block_size)); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); - prb_setup_retire_blk_timer(po); + hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime, + HRTIMER_MODE_REL_SOFT); prb_open_block(p1, pbd); } -/* Do NOT update the last_blk_num first. - * Assumes sk_buff_head lock is held. - */ -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) -{ - mod_timer(&pkc->retire_blk_timer, - jiffies + pkc->tov_in_jiffies); - pkc->last_kactive_blk_num = pkc->kactive_blk_num; -} - /* - * Timer logic: - * 1) We refresh the timer only when we open a block. - * By doing this we don't waste cycles refreshing the timer - * on packet-by-packet basis. - * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * - * So, if the user sets the 'tmo' to 10ms then the timer - * will never fire while the block is still getting filled - * (which is what we want). However, the user could choose - * to close a block early and that's fine. - * - * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. - * */ -static void prb_retire_rx_blk_timer_expired(struct timer_list *t) +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t) { struct packet_sock *po = - from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer); + timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; @@ -732,9 +690,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); - if (unlikely(pkc->delete_blk_timer)) - goto out; - /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() @@ -750,46 +705,31 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) write_unlock(&pkc->blk_fill_in_prog_lock); } - if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { - if (!frozen) { - if (!BLOCK_NUM_PKTS(pbd)) { - /* An empty block. Just refresh the timer. */ - goto refresh_timer; - } + if (!frozen) { + if (BLOCK_NUM_PKTS(pbd)) { + /* Not an empty block. Need retire the block. */ prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); - if (!prb_dispatch_next_block(pkc, po)) - goto refresh_timer; - else - goto out; - } else { - /* Case 1. Queue was frozen because user-space was - * lagging behind. + prb_dispatch_next_block(pkc, po); + } + } else { + /* Case 1. Queue was frozen because user-space was + * lagging behind. + */ + if (!prb_curr_blk_in_use(pbd)) { + /* Case 2. queue was frozen,user-space caught up, + * now the link went idle && the timer fired. + * We don't have a block to close.So we open this + * block and restart the timer. + * opening a block thaws the queue,restarts timer + * Thawing/timer-refresh is a side effect. */ - if (prb_curr_blk_in_use(pbd)) { - /* - * Ok, user-space is still behind. - * So just refresh the timer. - */ - goto refresh_timer; - } else { - /* Case 2. queue was frozen,user-space caught up, - * now the link went idle && the timer fired. - * We don't have a block to close.So we open this - * block and restart the timer. - * opening a block thaws the queue,restarts timer - * Thawing/timer-refresh is a side effect. - */ - prb_open_block(pkc, pbd); - goto out; - } + prb_open_block(pkc, pbd); } } -refresh_timer: - _prb_refresh_rx_retire_blk_timer(pkc); - -out: + hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime); spin_unlock(&po->sk.sk_receive_queue.lock); + return HRTIMER_RESTART; } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, @@ -883,11 +823,18 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) } /* - * Side effect of opening a block: + * prb_open_block is called by tpacket_rcv or timer callback. * - * 1) prb_queue is thawed. - * 2) retire_blk_timer is refreshed. + * Reasons why NOT update hrtimer in prb_open_block: + * 1) It will increase complexity to distinguish the two caller scenario. + * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update + * TMO of an already enqueued hrtimer, leading to complex shutdown logic. * + * One side effect of NOT update hrtimer when called by tpacket_rcv is that + * a newly opened block triggered by tpacket_rcv may be retired earlier than + * expected. On the other hand, if timeout is updated in prb_open_block, the + * frequent reception of network packets that leads to prb_open_block being + * called may cause hrtimer to be removed and enqueued repeatedly. */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) @@ -921,7 +868,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1, pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); - _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } @@ -2102,8 +2048,8 @@ retry: skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = READ_ONCE(sk->sk_mark); + skb->priority = sockc.priority; + skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); @@ -2265,7 +2211,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: atomic_inc(&po->tp_drops); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: @@ -2634,8 +2580,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(po->sk.sk_priority); - skb->mark = READ_ONCE(po->sk.sk_mark); + skb->priority = sockc->priority; + skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); @@ -2785,7 +2731,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; - long timeo = 0; + long timeo; mutex_lock(&po->pg_vec_lock); @@ -2839,22 +2785,28 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; + timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); reinit_completion(&po->skb_completion); do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { - if (need_wait && skb) { - timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); + /* Note: packet_read_pending() might be slow if we + * have to call it as it's per_cpu variable, but in + * fast-path we don't have to call it, only when ph + * is NULL, we need to check the pending_refcnt. + */ + if (need_wait && packet_read_pending(&po->tx_ring)) { timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) { err = !timeo ? -ETIMEDOUT : -ERESTARTSYS; goto out_put; } - } - /* check for additional frames */ - continue; + /* check for additional frames */ + continue; + } else + break; } skb = NULL; @@ -2943,14 +2895,7 @@ tpacket_error: } packet_increment_head(&po->tx_ring); len_sum += tp_len; - } while (likely((ph != NULL) || - /* Note: packet_read_pending() might be slow if we have - * to call it as it's per_cpu variable, but in fast-path - * we already short-circuit the loop with the first - * condition, and luckily don't have to go that path - * anyway. - */ - (need_wait && packet_read_pending(&po->tx_ring)))); + } while (1); err = len_sum; goto out_put; @@ -3039,7 +2984,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; sockcm_init(&sockc, sk); - sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) @@ -3335,11 +3279,12 @@ out_unlock: * Bind a packet socket to a device */ -static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, +static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[sizeof(uaddr->sa_data_min) + 1]; + struct sockaddr *sa = (struct sockaddr *)uaddr; + char name[sizeof(sa->sa_data) + 1]; /* * Check legality @@ -3350,13 +3295,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ - memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); - name[sizeof(uaddr->sa_data_min)] = 0; + memcpy(name, sa->sa_data, sizeof(sa->sa_data)); + name[sizeof(sa->sa_data)] = 0; return packet_do_bind(sk, name, 0, 0); } -static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int packet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; @@ -3636,11 +3581,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; - memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); + memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) - strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); + strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); return sizeof(*uaddr); @@ -3714,15 +3659,15 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, } static void packet_dev_mclist_delete(struct net_device *dev, - struct packet_mclist **mlp) + struct packet_mclist **mlp, + struct list_head *list) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { - packet_dev_mc(dev, ml, -1); + list_add(&ml->remove_list, list); *mlp = ml->next; - kfree(ml); } else mlp = &ml->next; } @@ -3770,6 +3715,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; + INIT_LIST_HEAD(&i->remove_list); i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); @@ -4234,9 +4180,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { - struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct packet_mclist *ml, *tmp; + LIST_HEAD(mclist); + struct sock *sk; rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { @@ -4245,7 +4193,8 @@ static int packet_notifier(struct notifier_block *this, switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) - packet_dev_mclist_delete(dev, &po->mclist); + packet_dev_mclist_delete(dev, &po->mclist, + &mclist); fallthrough; case NETDEV_DOWN: @@ -4278,6 +4227,13 @@ static int packet_notifier(struct notifier_block *this, } } rcu_read_unlock(); + + /* packet_dev_mc might grab instance locks so can't run under rcu */ + list_for_each_entry_safe(ml, tmp, &mclist, remove_list) { + packet_dev_mc(dev, ml, -1); + kfree(ml); + } + return NOTIFY_DONE; } @@ -4564,10 +4520,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; - if (was_running) { - WRITE_ONCE(po->num, 0); + WRITE_ONCE(po->num, 0); + if (was_running) __unregister_prot_hook(sk, false); - } + spin_unlock(&po->bind_lock); synchronize_net(); @@ -4599,10 +4555,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); - if (was_running) { - WRITE_ONCE(po->num, num); + WRITE_ONCE(po->num, num); + if (was_running) register_prot_hook(sk); - } + spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ @@ -4773,7 +4729,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) READ_ONCE(po->ifindex), packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), + from_kuid_munged(seq_user_ns(seq), sk_uid(s)), sock_i_ino(s)); } |
